From 5b62bef8cbf73f910513ef3b1f557aa94b384854 Mon Sep 17 00:00:00 2001 From: Xiangrui Meng Date: Wed, 19 Aug 2015 13:17:26 -0700 Subject: [PATCH 001/802] [SPARK-8918] [MLLIB] [DOC] Add @since tags to mllib.clustering This continues the work from #8256. I removed `since` tags from private/protected/local methods/variables (see https://github.com/apache/spark/commit/72fdeb64630470f6f46cf3eed8ffbfe83a7c4659). MechCoder Closes #8256 Author: Xiangrui Meng Author: Xiaoqing Wang Author: MechCoder Closes #8288 from mengxr/SPARK-8918. --- .../mllib/clustering/GaussianMixture.scala | 56 +++++++++++---- .../clustering/GaussianMixtureModel.scala | 32 +++++++-- .../spark/mllib/clustering/KMeans.scala | 36 +++++++++- .../spark/mllib/clustering/KMeansModel.scala | 37 ++++++++-- .../apache/spark/mllib/clustering/LDA.scala | 71 ++++++++++++++++--- .../spark/mllib/clustering/LDAModel.scala | 64 +++++++++++++++-- .../spark/mllib/clustering/LDAOptimizer.scala | 12 +++- .../clustering/PowerIterationClustering.scala | 29 +++++++- .../mllib/clustering/StreamingKMeans.scala | 53 +++++++++++--- 9 files changed, 338 insertions(+), 52 deletions(-) diff --git a/mllib/src/main/scala/org/apache/spark/mllib/clustering/GaussianMixture.scala b/mllib/src/main/scala/org/apache/spark/mllib/clustering/GaussianMixture.scala index e459367333d26..bc27b1fe7390b 100644 --- a/mllib/src/main/scala/org/apache/spark/mllib/clustering/GaussianMixture.scala +++ b/mllib/src/main/scala/org/apache/spark/mllib/clustering/GaussianMixture.scala @@ -62,6 +62,7 @@ class GaussianMixture private ( /** * Constructs a default instance. The default parameters are {k: 2, convergenceTol: 0.01, * maxIterations: 100, seed: random}. + * @since 1.3.0 */ def this() = this(2, 0.01, 100, Utils.random.nextLong()) @@ -72,9 +73,11 @@ class GaussianMixture private ( // default random starting point private var initialModel: Option[GaussianMixtureModel] = None - /** Set the initial GMM starting point, bypassing the random initialization. - * You must call setK() prior to calling this method, and the condition - * (model.k == this.k) must be met; failure will result in an IllegalArgumentException + /** + * Set the initial GMM starting point, bypassing the random initialization. + * You must call setK() prior to calling this method, and the condition + * (model.k == this.k) must be met; failure will result in an IllegalArgumentException + * @since 1.3.0 */ def setInitialModel(model: GaussianMixtureModel): this.type = { if (model.k == k) { @@ -85,30 +88,46 @@ class GaussianMixture private ( this } - /** Return the user supplied initial GMM, if supplied */ + /** + * Return the user supplied initial GMM, if supplied + * @since 1.3.0 + */ def getInitialModel: Option[GaussianMixtureModel] = initialModel - /** Set the number of Gaussians in the mixture model. Default: 2 */ + /** + * Set the number of Gaussians in the mixture model. Default: 2 + * @since 1.3.0 + */ def setK(k: Int): this.type = { this.k = k this } - /** Return the number of Gaussians in the mixture model */ + /** + * Return the number of Gaussians in the mixture model + * @since 1.3.0 + */ def getK: Int = k - /** Set the maximum number of iterations to run. Default: 100 */ + /** + * Set the maximum number of iterations to run. 
Default: 100 + * @since 1.3.0 + */ def setMaxIterations(maxIterations: Int): this.type = { this.maxIterations = maxIterations this } - /** Return the maximum number of iterations to run */ + /** + * Return the maximum number of iterations to run + * @since 1.3.0 + */ def getMaxIterations: Int = maxIterations /** * Set the largest change in log-likelihood at which convergence is * considered to have occurred. + * @since 1.3.0 */ def setConvergenceTol(convergenceTol: Double): this.type = { this.convergenceTol = convergenceTol @@ -118,19 +137,29 @@ class GaussianMixture private ( /** * Return the largest change in log-likelihood at which convergence is * considered to have occurred. + * @since 1.3.0 */ def getConvergenceTol: Double = convergenceTol - /** Set the random seed */ + /** + * Set the random seed + * @since 1.3.0 + */ def setSeed(seed: Long): this.type = { this.seed = seed this } - /** Return the random seed */ + /** + * Return the random seed + * @since 1.3.0 + */ def getSeed: Long = seed - /** Perform expectation maximization */ + /** + * Perform expectation maximization + * @since 1.3.0 + */ def run(data: RDD[Vector]): GaussianMixtureModel = { val sc = data.sparkContext @@ -204,7 +233,10 @@ class GaussianMixture private ( new GaussianMixtureModel(weights, gaussians) } - /** Java-friendly version of [[run()]] */ + /** + * Java-friendly version of [[run()]] + * @since 1.3.0 + */ def run(data: JavaRDD[Vector]): GaussianMixtureModel = run(data.rdd) private def updateWeightsAndGaussians( diff --git a/mllib/src/main/scala/org/apache/spark/mllib/clustering/GaussianMixtureModel.scala b/mllib/src/main/scala/org/apache/spark/mllib/clustering/GaussianMixtureModel.scala index 76aeebd703d4e..2fa0473737aae 100644 --- a/mllib/src/main/scala/org/apache/spark/mllib/clustering/GaussianMixtureModel.scala +++ b/mllib/src/main/scala/org/apache/spark/mllib/clustering/GaussianMixtureModel.scala @@ -43,6 +43,7 @@ import org.apache.spark.sql.{SQLContext, Row} * the weight for Gaussian i, and weights.sum == 1 * @param gaussians Array of MultivariateGaussian where gaussians(i) represents * the Multivariate Gaussian (Normal) Distribution for Gaussian i + * @since 1.3.0 */ @Experimental class GaussianMixtureModel( @@ -53,32 +54,48 @@ class GaussianMixtureModel( override protected def formatVersion = "1.0" + /** + * @since 1.4.0 + */ override def save(sc: SparkContext, path: String): Unit = { GaussianMixtureModel.SaveLoadV1_0.save(sc, path, weights, gaussians) } - /** Number of gaussians in mixture */ + /** + * Number of gaussians in mixture + * @since 1.3.0 + */ def k: Int = weights.length - /** Maps given points to their cluster indices. */ + /** + * Maps given points to their cluster indices. + * @since 1.3.0 + */ def predict(points: RDD[Vector]): RDD[Int] = { val responsibilityMatrix = predictSoft(points) responsibilityMatrix.map(r => r.indexOf(r.max)) } - /** Maps given point to its cluster index. */ + /** + * Maps given point to its cluster index. + * @since 1.5.0 + */ def predict(point: Vector): Int = { val r = computeSoftAssignments(point.toBreeze.toDenseVector, gaussians, weights, k) r.indexOf(r.max) } - /** Java-friendly version of [[predict()]] */ + /** + * Java-friendly version of [[predict()]] + * @since 1.4.0 + */ def predict(points: JavaRDD[Vector]): JavaRDD[java.lang.Integer] = predict(points.rdd).toJavaRDD().asInstanceOf[JavaRDD[java.lang.Integer]] /** * Given the input vectors, return the membership value of each vector * to all mixture components. 
+ * @since 1.3.0 */ def predictSoft(points: RDD[Vector]): RDD[Array[Double]] = { val sc = points.sparkContext @@ -91,6 +108,7 @@ class GaussianMixtureModel( /** * Given the input vector, return the membership values to all mixture components. + * @since 1.4.0 */ def predictSoft(point: Vector): Array[Double] = { computeSoftAssignments(point.toBreeze.toDenseVector, gaussians, weights, k) @@ -115,6 +133,9 @@ class GaussianMixtureModel( } } +/** + * @since 1.4.0 + */ @Experimental object GaussianMixtureModel extends Loader[GaussianMixtureModel] { @@ -165,6 +186,9 @@ object GaussianMixtureModel extends Loader[GaussianMixtureModel] { } } + /** + * @since 1.4.0 + */ override def load(sc: SparkContext, path: String) : GaussianMixtureModel = { val (loadedClassName, version, metadata) = Loader.loadMetadata(sc, path) implicit val formats = DefaultFormats diff --git a/mllib/src/main/scala/org/apache/spark/mllib/clustering/KMeans.scala b/mllib/src/main/scala/org/apache/spark/mllib/clustering/KMeans.scala index 0a65403f4ec95..9ef6834e5ea8d 100644 --- a/mllib/src/main/scala/org/apache/spark/mllib/clustering/KMeans.scala +++ b/mllib/src/main/scala/org/apache/spark/mllib/clustering/KMeans.scala @@ -49,15 +49,20 @@ class KMeans private ( /** * Constructs a KMeans instance with default parameters: {k: 2, maxIterations: 20, runs: 1, * initializationMode: "k-means||", initializationSteps: 5, epsilon: 1e-4, seed: random}. + * @since 0.8.0 */ def this() = this(2, 20, 1, KMeans.K_MEANS_PARALLEL, 5, 1e-4, Utils.random.nextLong()) /** * Number of clusters to create (k). + * @since 1.4.0 */ def getK: Int = k - /** Set the number of clusters to create (k). Default: 2. */ + /** + * Set the number of clusters to create (k). Default: 2. + * @since 0.8.0 + */ def setK(k: Int): this.type = { this.k = k this @@ -65,10 +70,14 @@ class KMeans private ( /** * Maximum number of iterations to run. + * @since 1.4.0 */ def getMaxIterations: Int = maxIterations - /** Set maximum number of iterations to run. Default: 20. */ + /** + * Set maximum number of iterations to run. Default: 20. + * @since 0.8.0 + */ def setMaxIterations(maxIterations: Int): this.type = { this.maxIterations = maxIterations this @@ -76,6 +85,7 @@ class KMeans private ( /** * The initialization algorithm. This can be either "random" or "k-means||". + * @since 1.4.0 */ def getInitializationMode: String = initializationMode @@ -83,6 +93,7 @@ class KMeans private ( * Set the initialization algorithm. This can be either "random" to choose random points as * initial cluster centers, or "k-means||" to use a parallel variant of k-means++ * (Bahmani et al., Scalable K-Means++, VLDB 2012). Default: k-means||. + * @since 0.8.0 */ def setInitializationMode(initializationMode: String): this.type = { KMeans.validateInitMode(initializationMode) @@ -93,6 +104,7 @@ class KMeans private ( /** * :: Experimental :: * Number of runs of the algorithm to execute in parallel. + * @since 1.4.0 */ @Experimental def getRuns: Int = runs @@ -102,6 +114,7 @@ class KMeans private ( * Set the number of runs of the algorithm to execute in parallel. We initialize the algorithm * this many times with random starting conditions (configured by the initialization mode), then * return the best clustering found over any run. Default: 1. 
+ * @since 0.8.0 */ @Experimental def setRuns(runs: Int): this.type = { @@ -114,12 +127,14 @@ class KMeans private ( /** * Number of steps for the k-means|| initialization mode + * @since 1.4.0 */ def getInitializationSteps: Int = initializationSteps /** * Set the number of steps for the k-means|| initialization mode. This is an advanced * setting -- the default of 5 is almost always enough. Default: 5. + * @since 0.8.0 */ def setInitializationSteps(initializationSteps: Int): this.type = { if (initializationSteps <= 0) { @@ -131,12 +146,14 @@ class KMeans private ( /** * The distance threshold within which we've consider centers to have converged. + * @since 1.4.0 */ def getEpsilon: Double = epsilon /** * Set the distance threshold within which we've consider centers to have converged. * If all centers move less than this Euclidean distance, we stop iterating one run. + * @since 0.8.0 */ def setEpsilon(epsilon: Double): this.type = { this.epsilon = epsilon @@ -145,10 +162,14 @@ class KMeans private ( /** * The random seed for cluster initialization. + * @since 1.4.0 */ def getSeed: Long = seed - /** Set the random seed for cluster initialization. */ + /** + * Set the random seed for cluster initialization. + * @since 1.4.0 + */ def setSeed(seed: Long): this.type = { this.seed = seed this @@ -162,6 +183,7 @@ class KMeans private ( * Set the initial starting point, bypassing the random initialization or k-means|| * The condition model.k == this.k must be met, failure results * in an IllegalArgumentException. + * @since 1.4.0 */ def setInitialModel(model: KMeansModel): this.type = { require(model.k == k, "mismatched cluster count") @@ -172,6 +194,7 @@ class KMeans private ( /** * Train a K-means model on the given set of points; `data` should be cached for high * performance, because this is an iterative algorithm. + * @since 0.8.0 */ def run(data: RDD[Vector]): KMeansModel = { @@ -430,11 +453,14 @@ class KMeans private ( /** * Top-level methods for calling K-means clustering. + * @since 0.8.0 */ object KMeans { // Initialization mode names + /** @since 0.8.0 */ val RANDOM = "random" + /** @since 0.8.0 */ val K_MEANS_PARALLEL = "k-means||" /** @@ -446,6 +472,7 @@ object KMeans { * @param runs number of parallel runs, defaults to 1. The best model is returned. * @param initializationMode initialization model, either "random" or "k-means||" (default). * @param seed random seed value for cluster initialization + * @since 1.3.0 */ def train( data: RDD[Vector], @@ -470,6 +497,7 @@ object KMeans { * @param maxIterations max number of iterations * @param runs number of parallel runs, defaults to 1. The best model is returned. * @param initializationMode initialization model, either "random" or "k-means||" (default). + * @since 0.8.0 */ def train( data: RDD[Vector], @@ -486,6 +514,7 @@ object KMeans { /** * Trains a k-means model using specified parameters and the default values for unspecified. + * @since 0.8.0 */ def train( data: RDD[Vector], @@ -496,6 +525,7 @@ object KMeans { /** * Trains a k-means model using specified parameters and the default values for unspecified. 
+ * @since 0.8.0 */ def train( data: RDD[Vector], diff --git a/mllib/src/main/scala/org/apache/spark/mllib/clustering/KMeansModel.scala b/mllib/src/main/scala/org/apache/spark/mllib/clustering/KMeansModel.scala index 96359024fa228..8de2087ceb4df 100644 --- a/mllib/src/main/scala/org/apache/spark/mllib/clustering/KMeansModel.scala +++ b/mllib/src/main/scala/org/apache/spark/mllib/clustering/KMeansModel.scala @@ -34,35 +34,52 @@ import org.apache.spark.sql.Row /** * A clustering model for K-means. Each point belongs to the cluster with the closest center. + * @since 0.8.0 */ class KMeansModel ( val clusterCenters: Array[Vector]) extends Saveable with Serializable with PMMLExportable { - /** A Java-friendly constructor that takes an Iterable of Vectors. */ + /** + * A Java-friendly constructor that takes an Iterable of Vectors. + * @since 1.4.0 + */ def this(centers: java.lang.Iterable[Vector]) = this(centers.asScala.toArray) - /** Total number of clusters. */ + /** + * Total number of clusters. + * @since 0.8.0 + */ def k: Int = clusterCenters.length - /** Returns the cluster index that a given point belongs to. */ + /** + * Returns the cluster index that a given point belongs to. + * @since 0.8.0 + */ def predict(point: Vector): Int = { KMeans.findClosest(clusterCentersWithNorm, new VectorWithNorm(point))._1 } - /** Maps given points to their cluster indices. */ + /** + * Maps given points to their cluster indices. + * @since 1.0.0 + */ def predict(points: RDD[Vector]): RDD[Int] = { val centersWithNorm = clusterCentersWithNorm val bcCentersWithNorm = points.context.broadcast(centersWithNorm) points.map(p => KMeans.findClosest(bcCentersWithNorm.value, new VectorWithNorm(p))._1) } - /** Maps given points to their cluster indices. */ + /** + * Maps given points to their cluster indices. + * @since 1.0.0 + */ def predict(points: JavaRDD[Vector]): JavaRDD[java.lang.Integer] = predict(points.rdd).toJavaRDD().asInstanceOf[JavaRDD[java.lang.Integer]] /** * Return the K-means cost (sum of squared distances of points to their nearest center) for this * model on the given data. + * @since 0.8.0 */ def computeCost(data: RDD[Vector]): Double = { val centersWithNorm = clusterCentersWithNorm @@ -73,6 +90,9 @@ class KMeansModel ( private def clusterCentersWithNorm: Iterable[VectorWithNorm] = clusterCenters.map(new VectorWithNorm(_)) + /** + * @since 1.4.0 + */ override def save(sc: SparkContext, path: String): Unit = { KMeansModel.SaveLoadV1_0.save(sc, this, path) } @@ -80,7 +100,14 @@ class KMeansModel ( override protected def formatVersion: String = "1.0" } +/** + * @since 1.4.0 + */ object KMeansModel extends Loader[KMeansModel] { + + /** + * @since 1.4.0 + */ override def load(sc: SparkContext, path: String): KMeansModel = { KMeansModel.SaveLoadV1_0.load(sc, path) } diff --git a/mllib/src/main/scala/org/apache/spark/mllib/clustering/LDA.scala b/mllib/src/main/scala/org/apache/spark/mllib/clustering/LDA.scala index 0fc9b1ac4d716..2a8c6acbaec61 100644 --- a/mllib/src/main/scala/org/apache/spark/mllib/clustering/LDA.scala +++ b/mllib/src/main/scala/org/apache/spark/mllib/clustering/LDA.scala @@ -43,6 +43,7 @@ import org.apache.spark.util.Utils * * @see [[http://en.wikipedia.org/wiki/Latent_Dirichlet_allocation Latent Dirichlet allocation * (Wikipedia)]] + * @since 1.3.0 */ @Experimental class LDA private ( @@ -54,18 +55,25 @@ class LDA private ( private var checkpointInterval: Int, private var ldaOptimizer: LDAOptimizer) extends Logging { + /** + * Constructs a LDA instance with default parameters. 
+ * @since 1.3.0 + */ def this() = this(k = 10, maxIterations = 20, docConcentration = Vectors.dense(-1), topicConcentration = -1, seed = Utils.random.nextLong(), checkpointInterval = 10, ldaOptimizer = new EMLDAOptimizer) /** * Number of topics to infer. I.e., the number of soft cluster centers. + * + * @since 1.3.0 */ def getK: Int = k /** * Number of topics to infer. I.e., the number of soft cluster centers. * (default = 10) + * @since 1.3.0 */ def setK(k: Int): this.type = { require(k > 0, s"LDA k (number of clusters) must be > 0, but was set to $k") @@ -78,6 +86,7 @@ class LDA private ( * distributions over topics ("theta"). * * This is the parameter to a Dirichlet distribution. + * @since 1.5.0 */ def getAsymmetricDocConcentration: Vector = this.docConcentration @@ -87,6 +96,7 @@ class LDA private ( * * This method assumes the Dirichlet distribution is symmetric and can be described by a single * [[Double]] parameter. It should fail if docConcentration is asymmetric. + * @since 1.3.0 */ def getDocConcentration: Double = { val parameter = docConcentration(0) @@ -121,6 +131,7 @@ class LDA private ( * - Values should be >= 0 * - default = uniformly (1.0 / k), following the implementation from * [[https://github.com/Blei-Lab/onlineldavb]]. + * @since 1.5.0 */ def setDocConcentration(docConcentration: Vector): this.type = { require(docConcentration.size > 0, "docConcentration must have > 0 elements") @@ -128,22 +139,37 @@ class LDA private ( this } - /** Replicates a [[Double]] docConcentration to create a symmetric prior. */ + /** + * Replicates a [[Double]] docConcentration to create a symmetric prior. + * @since 1.3.0 + */ def setDocConcentration(docConcentration: Double): this.type = { this.docConcentration = Vectors.dense(docConcentration) this } - /** Alias for [[getAsymmetricDocConcentration]] */ + /** + * Alias for [[getAsymmetricDocConcentration]] + * @since 1.5.0 + */ def getAsymmetricAlpha: Vector = getAsymmetricDocConcentration - /** Alias for [[getDocConcentration]] */ + /** + * Alias for [[getDocConcentration]] + * @since 1.3.0 + */ def getAlpha: Double = getDocConcentration - /** Alias for [[setDocConcentration()]] */ + /** + * Alias for [[setDocConcentration()]] + * @since 1.5.0 + */ def setAlpha(alpha: Vector): this.type = setDocConcentration(alpha) - /** Alias for [[setDocConcentration()]] */ + /** + * Alias for [[setDocConcentration()]] + * @since 1.3.0 + */ def setAlpha(alpha: Double): this.type = setDocConcentration(alpha) /** @@ -154,6 +180,7 @@ class LDA private ( * * Note: The topics' distributions over terms are called "beta" in the original LDA paper * by Blei et al., but are called "phi" in many later papers such as Asuncion et al., 2009. + * @since 1.3.0 */ def getTopicConcentration: Double = this.topicConcentration @@ -178,36 +205,51 @@ class LDA private ( * - Value should be >= 0 * - default = (1.0 / k), following the implementation from * [[https://github.com/Blei-Lab/onlineldavb]]. + * @since 1.3.0 */ def setTopicConcentration(topicConcentration: Double): this.type = { this.topicConcentration = topicConcentration this } - /** Alias for [[getTopicConcentration]] */ + /** + * Alias for [[getTopicConcentration]] + * @since 1.3.0 + */ def getBeta: Double = getTopicConcentration - /** Alias for [[setTopicConcentration()]] */ + /** + * Alias for [[setTopicConcentration()]] + * @since 1.3.0 + */ def setBeta(beta: Double): this.type = setTopicConcentration(beta) /** * Maximum number of iterations for learning. 
+ * @since 1.3.0 */ def getMaxIterations: Int = maxIterations /** * Maximum number of iterations for learning. * (default = 20) + * @since 1.3.0 */ def setMaxIterations(maxIterations: Int): this.type = { this.maxIterations = maxIterations this } - /** Random seed */ + /** + * Random seed + * @since 1.3.0 + */ def getSeed: Long = seed - /** Random seed */ + /** + * Random seed + * @since 1.3.0 + */ def setSeed(seed: Long): this.type = { this.seed = seed this @@ -215,6 +257,7 @@ class LDA private ( /** * Period (in iterations) between checkpoints. + * @since 1.3.0 */ def getCheckpointInterval: Int = checkpointInterval @@ -225,6 +268,7 @@ class LDA private ( * [[org.apache.spark.SparkContext]], this setting is ignored. * * @see [[org.apache.spark.SparkContext#setCheckpointDir]] + * @since 1.3.0 */ def setCheckpointInterval(checkpointInterval: Int): this.type = { this.checkpointInterval = checkpointInterval @@ -236,6 +280,7 @@ class LDA private ( * :: DeveloperApi :: * * LDAOptimizer used to perform the actual calculation + * @since 1.4.0 */ @DeveloperApi def getOptimizer: LDAOptimizer = ldaOptimizer @@ -244,6 +289,7 @@ class LDA private ( * :: DeveloperApi :: * * LDAOptimizer used to perform the actual calculation (default = EMLDAOptimizer) + * @since 1.4.0 */ @DeveloperApi def setOptimizer(optimizer: LDAOptimizer): this.type = { @@ -254,6 +300,7 @@ class LDA private ( /** * Set the LDAOptimizer used to perform the actual calculation by algorithm name. * Currently "em", "online" are supported. + * @since 1.4.0 */ def setOptimizer(optimizerName: String): this.type = { this.ldaOptimizer = @@ -274,6 +321,7 @@ class LDA private ( * (where the vocabulary size is the length of the vector). * Document IDs must be unique and >= 0. * @return Inferred LDA model + * @since 1.3.0 */ def run(documents: RDD[(Long, Vector)]): LDAModel = { val state = ldaOptimizer.initialize(documents, this) @@ -289,7 +337,10 @@ class LDA private ( state.getLDAModel(iterationTimes) } - /** Java-friendly version of [[run()]] */ + /** + * Java-friendly version of [[run()]] + * @since 1.3.0 + */ def run(documents: JavaPairRDD[java.lang.Long, Vector]): LDAModel = { run(documents.rdd.asInstanceOf[RDD[(Long, Vector)]]) } diff --git a/mllib/src/main/scala/org/apache/spark/mllib/clustering/LDAModel.scala b/mllib/src/main/scala/org/apache/spark/mllib/clustering/LDAModel.scala index 82f05e4a18cee..b70e380c0393e 100644 --- a/mllib/src/main/scala/org/apache/spark/mllib/clustering/LDAModel.scala +++ b/mllib/src/main/scala/org/apache/spark/mllib/clustering/LDAModel.scala @@ -192,12 +192,24 @@ class LocalLDAModel private[clustering] ( override protected[clustering] val gammaShape: Double = 100) extends LDAModel with Serializable { + /** + * @since 1.3.0 + */ override def k: Int = topics.numCols + /** + * @since 1.3.0 + */ override def vocabSize: Int = topics.numRows + /** + * @since 1.3.0 + */ override def topicsMatrix: Matrix = topics + /** + * @since 1.3.0 + */ override def describeTopics(maxTermsPerTopic: Int): Array[(Array[Int], Array[Double])] = { val brzTopics = topics.toBreeze.toDenseMatrix Range(0, k).map { topicIndex => @@ -210,6 +222,9 @@ class LocalLDAModel private[clustering] ( override protected def formatVersion = "1.0" + /** + * @since 1.5.0 + */ override def save(sc: SparkContext, path: String): Unit = { LocalLDAModel.SaveLoadV1_0.save(sc, path, topicsMatrix, docConcentration, topicConcentration, gammaShape) @@ -223,12 +238,16 @@ class LocalLDAModel private[clustering] ( * * @param documents test corpus to use for 
calculating log likelihood * @return variational lower bound on the log likelihood of the entire corpus + * @since 1.5.0 */ def logLikelihood(documents: RDD[(Long, Vector)]): Double = logLikelihoodBound(documents, docConcentration, topicConcentration, topicsMatrix.toBreeze.toDenseMatrix, gammaShape, k, vocabSize) - /** Java-friendly version of [[logLikelihood]] */ + /** + * Java-friendly version of [[logLikelihood]] + * @since 1.5.0 + */ def logLikelihood(documents: JavaPairRDD[java.lang.Long, Vector]): Double = { logLikelihood(documents.rdd.asInstanceOf[RDD[(Long, Vector)]]) } @@ -239,6 +258,7 @@ class LocalLDAModel private[clustering] ( * * @param documents test corpus to use for calculating perplexity * @return Variational upper bound on log perplexity per token. + * @since 1.5.0 */ def logPerplexity(documents: RDD[(Long, Vector)]): Double = { val corpusTokenCount = documents @@ -247,7 +267,9 @@ class LocalLDAModel private[clustering] ( -logLikelihood(documents) / corpusTokenCount } - /** Java-friendly version of [[logPerplexity]] */ + /** Java-friendly version of [[logPerplexity]] + * @since 1.5.0 + */ def logPerplexity(documents: JavaPairRDD[java.lang.Long, Vector]): Double = { logPerplexity(documents.rdd.asInstanceOf[RDD[(Long, Vector)]]) } @@ -325,6 +347,7 @@ class LocalLDAModel private[clustering] ( * for each document. * @param documents documents to predict topic mixture distributions for * @return An RDD of (document ID, topic mixture distribution for document) + * @since 1.3.0 */ // TODO: declare in LDAModel and override once implemented in DistributedLDAModel def topicDistributions(documents: RDD[(Long, Vector)]): RDD[(Long, Vector)] = { @@ -351,7 +374,10 @@ class LocalLDAModel private[clustering] ( } } - /** Java-friendly version of [[topicDistributions]] */ + /** + * Java-friendly version of [[topicDistributions]] + * @since 1.4.1 + */ def topicDistributions( documents: JavaPairRDD[java.lang.Long, Vector]): JavaPairRDD[java.lang.Long, Vector] = { val distributions = topicDistributions(documents.rdd.asInstanceOf[RDD[(Long, Vector)]]) @@ -425,6 +451,9 @@ object LocalLDAModel extends Loader[LocalLDAModel] { } } + /** + * @since 1.5.0 + */ override def load(sc: SparkContext, path: String): LocalLDAModel = { val (loadedClassName, loadedVersion, metadata) = Loader.loadMetadata(sc, path) implicit val formats = DefaultFormats @@ -481,6 +510,7 @@ class DistributedLDAModel private[clustering] ( * Convert model to a local model. * The local model stores the inferred topics but not the topic distributions for training * documents. + * @since 1.3.0 */ def toLocal: LocalLDAModel = new LocalLDAModel(topicsMatrix, docConcentration, topicConcentration, gammaShape) @@ -491,6 +521,7 @@ class DistributedLDAModel private[clustering] ( * No guarantees are given about the ordering of the topics. * * WARNING: This matrix is collected from an RDD. Beware memory usage when vocabSize, k are large. + * @since 1.3.0 */ override lazy val topicsMatrix: Matrix = { // Collect row-major topics @@ -510,6 +541,9 @@ class DistributedLDAModel private[clustering] ( Matrices.fromBreeze(brzTopics) } + /** + * @since 1.3.0 + */ override def describeTopics(maxTermsPerTopic: Int): Array[(Array[Int], Array[Double])] = { val numTopics = k // Note: N_k is not needed to find the top terms, but it is needed to normalize weights @@ -548,6 +582,7 @@ class DistributedLDAModel private[clustering] ( * @return Array over topics. 
Each element represent as a pair of matching arrays: * (IDs for the documents, weights of the topic in these documents). * For each topic, documents are sorted in order of decreasing topic weights. + * @since 1.5.0 */ def topDocumentsPerTopic(maxDocumentsPerTopic: Int): Array[(Array[Long], Array[Double])] = { val numTopics = k @@ -587,6 +622,7 @@ class DistributedLDAModel private[clustering] ( * - This excludes the prior; for that, use [[logPrior]]. * - Even with [[logPrior]], this is NOT the same as the data log likelihood given the * hyperparameters. + * @since 1.3.0 */ lazy val logLikelihood: Double = { // TODO: generalize this for asymmetric (non-scalar) alpha @@ -612,7 +648,8 @@ class DistributedLDAModel private[clustering] ( /** * Log probability of the current parameter estimate: - * log P(topics, topic distributions for docs | alpha, eta) + * log P(topics, topic distributions for docs | alpha, eta) + * @since 1.3.0 */ lazy val logPrior: Double = { // TODO: generalize this for asymmetric (non-scalar) alpha @@ -644,6 +681,7 @@ class DistributedLDAModel private[clustering] ( * ("theta_doc"). * * @return RDD of (document ID, topic distribution) pairs + * @since 1.3.0 */ def topicDistributions: RDD[(Long, Vector)] = { graph.vertices.filter(LDA.isDocumentVertex).map { case (docID, topicCounts) => @@ -651,7 +689,10 @@ class DistributedLDAModel private[clustering] ( } } - /** Java-friendly version of [[topicDistributions]] */ + /** + * Java-friendly version of [[topicDistributions]] + * @since 1.4.1 + */ def javaTopicDistributions: JavaPairRDD[java.lang.Long, Vector] = { JavaPairRDD.fromRDD(topicDistributions.asInstanceOf[RDD[(java.lang.Long, Vector)]]) } @@ -659,6 +700,7 @@ class DistributedLDAModel private[clustering] ( /** * For each document, return the top k weighted topics for that document and their weights. 
* @return RDD of (doc ID, topic indices, topic weights) + * @since 1.5.0 */ def topTopicsPerDocument(k: Int): RDD[(Long, Array[Int], Array[Double])] = { graph.vertices.filter(LDA.isDocumentVertex).map { case (docID, topicCounts) => @@ -673,7 +715,10 @@ class DistributedLDAModel private[clustering] ( } } - /** Java-friendly version of [[topTopicsPerDocument]] */ + /** + * Java-friendly version of [[topTopicsPerDocument]] + * @since 1.5.0 + */ def javaTopTopicsPerDocument(k: Int): JavaRDD[(java.lang.Long, Array[Int], Array[Double])] = { val topics = topTopicsPerDocument(k) topics.asInstanceOf[RDD[(java.lang.Long, Array[Int], Array[Double])]].toJavaRDD() @@ -684,6 +729,10 @@ class DistributedLDAModel private[clustering] ( override protected def formatVersion = "1.0" + /** + * Java-friendly version of [[topicDistributions]] + * @since 1.5.0 + */ override def save(sc: SparkContext, path: String): Unit = { DistributedLDAModel.SaveLoadV1_0.save( sc, path, graph, globalTopicTotals, k, vocabSize, docConcentration, topicConcentration, @@ -784,6 +833,9 @@ object DistributedLDAModel extends Loader[DistributedLDAModel] { } + /** + * @since 1.5.0 + */ override def load(sc: SparkContext, path: String): DistributedLDAModel = { val (loadedClassName, loadedVersion, metadata) = Loader.loadMetadata(sc, path) implicit val formats = DefaultFormats diff --git a/mllib/src/main/scala/org/apache/spark/mllib/clustering/LDAOptimizer.scala b/mllib/src/main/scala/org/apache/spark/mllib/clustering/LDAOptimizer.scala index a0008f9c99ad7..360241c8081ac 100644 --- a/mllib/src/main/scala/org/apache/spark/mllib/clustering/LDAOptimizer.scala +++ b/mllib/src/main/scala/org/apache/spark/mllib/clustering/LDAOptimizer.scala @@ -35,6 +35,7 @@ import org.apache.spark.rdd.RDD * * An LDAOptimizer specifies which optimization/learning/inference algorithm to use, and it can * hold optimizer-specific parameters for users to set. + * @since 1.4.0 */ @DeveloperApi sealed trait LDAOptimizer { @@ -73,7 +74,7 @@ sealed trait LDAOptimizer { * - Paper which clearly explains several algorithms, including EM: * Asuncion, Welling, Smyth, and Teh. * "On Smoothing and Inference for Topic Models." UAI, 2009. - * + * @since 1.4.0 */ @DeveloperApi final class EMLDAOptimizer extends LDAOptimizer { @@ -225,6 +226,7 @@ final class EMLDAOptimizer extends LDAOptimizer { * * Original Online LDA paper: * Hoffman, Blei and Bach, "Online Learning for Latent Dirichlet Allocation." NIPS, 2010. + * @since 1.4.0 */ @DeveloperApi final class OnlineLDAOptimizer extends LDAOptimizer { @@ -274,6 +276,7 @@ final class OnlineLDAOptimizer extends LDAOptimizer { /** * A (positive) learning parameter that downweights early iterations. Larger values make early * iterations count less. + * @since 1.4.0 */ def getTau0: Double = this.tau0 @@ -281,6 +284,7 @@ final class OnlineLDAOptimizer extends LDAOptimizer { * A (positive) learning parameter that downweights early iterations. Larger values make early * iterations count less. * Default: 1024, following the original Online LDA paper. + * @since 1.4.0 */ def setTau0(tau0: Double): this.type = { require(tau0 > 0, s"LDA tau0 must be positive, but was set to $tau0") @@ -290,6 +294,7 @@ final class OnlineLDAOptimizer extends LDAOptimizer { /** * Learning rate: exponential decay rate + * @since 1.4.0 */ def getKappa: Double = this.kappa @@ -297,6 +302,7 @@ final class OnlineLDAOptimizer extends LDAOptimizer { * Learning rate: exponential decay rate---should be between * (0.5, 1.0] to guarantee asymptotic convergence. 
* Default: 0.51, based on the original Online LDA paper. + * @since 1.4.0 */ def setKappa(kappa: Double): this.type = { require(kappa >= 0, s"Online LDA kappa must be nonnegative, but was set to $kappa") @@ -306,6 +312,7 @@ final class OnlineLDAOptimizer extends LDAOptimizer { /** * Mini-batch fraction, which sets the fraction of document sampled and used in each iteration + * @since 1.4.0 */ def getMiniBatchFraction: Double = this.miniBatchFraction @@ -318,6 +325,7 @@ final class OnlineLDAOptimizer extends LDAOptimizer { * maxIterations * miniBatchFraction >= 1. * * Default: 0.05, i.e., 5% of total documents. + * @since 1.4.0 */ def setMiniBatchFraction(miniBatchFraction: Double): this.type = { require(miniBatchFraction > 0.0 && miniBatchFraction <= 1.0, @@ -329,6 +337,7 @@ final class OnlineLDAOptimizer extends LDAOptimizer { /** * Optimize alpha, indicates whether alpha (Dirichlet parameter for document-topic distribution) * will be optimized during training. + * @since 1.5.0 */ def getOptimzeAlpha: Boolean = this.optimizeAlpha @@ -336,6 +345,7 @@ final class OnlineLDAOptimizer extends LDAOptimizer { * Sets whether to optimize alpha parameter during training. * * Default: false + * @since 1.5.0 */ def setOptimzeAlpha(optimizeAlpha: Boolean): this.type = { this.optimizeAlpha = optimizeAlpha diff --git a/mllib/src/main/scala/org/apache/spark/mllib/clustering/PowerIterationClustering.scala b/mllib/src/main/scala/org/apache/spark/mllib/clustering/PowerIterationClustering.scala index 407e43a024a2e..b4733ca975152 100644 --- a/mllib/src/main/scala/org/apache/spark/mllib/clustering/PowerIterationClustering.scala +++ b/mllib/src/main/scala/org/apache/spark/mllib/clustering/PowerIterationClustering.scala @@ -39,12 +39,16 @@ import org.apache.spark.{Logging, SparkContext, SparkException} * * @param k number of clusters * @param assignments an RDD of clustering [[PowerIterationClustering#Assignment]]s + * @since 1.3.0 */ @Experimental class PowerIterationClusteringModel( val k: Int, val assignments: RDD[PowerIterationClustering.Assignment]) extends Saveable with Serializable { + /** + * @since 1.4.0 + */ override def save(sc: SparkContext, path: String): Unit = { PowerIterationClusteringModel.SaveLoadV1_0.save(sc, this, path) } @@ -52,6 +56,9 @@ class PowerIterationClusteringModel( override protected def formatVersion: String = "1.0" } +/** + * @since 1.4.0 + */ object PowerIterationClusteringModel extends Loader[PowerIterationClusteringModel] { override def load(sc: SparkContext, path: String): PowerIterationClusteringModel = { PowerIterationClusteringModel.SaveLoadV1_0.load(sc, path) @@ -65,6 +72,9 @@ object PowerIterationClusteringModel extends Loader[PowerIterationClusteringMode private[clustering] val thisClassName = "org.apache.spark.mllib.clustering.PowerIterationClusteringModel" + /** + * @since 1.4.0 + */ def save(sc: SparkContext, model: PowerIterationClusteringModel, path: String): Unit = { val sqlContext = new SQLContext(sc) import sqlContext.implicits._ @@ -77,6 +87,9 @@ object PowerIterationClusteringModel extends Loader[PowerIterationClusteringMode dataRDD.write.parquet(Loader.dataPath(path)) } + /** + * @since 1.4.0 + */ def load(sc: SparkContext, path: String): PowerIterationClusteringModel = { implicit val formats = DefaultFormats val sqlContext = new SQLContext(sc) @@ -120,13 +133,16 @@ class PowerIterationClustering private[clustering] ( import org.apache.spark.mllib.clustering.PowerIterationClustering._ - /** Constructs a PIC instance with default parameters: {k: 2, 
maxIterations: 100, - * initMode: "random"}. + /** + * Constructs a PIC instance with default parameters: {k: 2, maxIterations: 100, + * initMode: "random"}. + * @since 1.3.0 */ def this() = this(k = 2, maxIterations = 100, initMode = "random") /** * Set the number of clusters. + * @since 1.3.0 */ def setK(k: Int): this.type = { this.k = k @@ -135,6 +151,7 @@ class PowerIterationClustering private[clustering] ( /** * Set maximum number of iterations of the power iteration loop + * @since 1.3.0 */ def setMaxIterations(maxIterations: Int): this.type = { this.maxIterations = maxIterations @@ -144,6 +161,7 @@ class PowerIterationClustering private[clustering] ( /** * Set the initialization mode. This can be either "random" to use a random vector * as vertex properties, or "degree" to use normalized sum similarities. Default: random. + * @since 1.3.0 */ def setInitializationMode(mode: String): this.type = { this.initMode = mode match { @@ -164,6 +182,7 @@ class PowerIterationClustering private[clustering] ( * assume s,,ij,, = 0.0. * * @return a [[PowerIterationClusteringModel]] that contains the clustering result + * @since 1.5.0 */ def run(graph: Graph[Double, Double]): PowerIterationClusteringModel = { val w = normalize(graph) @@ -185,6 +204,7 @@ class PowerIterationClustering private[clustering] ( * assume s,,ij,, = 0.0. * * @return a [[PowerIterationClusteringModel]] that contains the clustering result + * @since 1.3.0 */ def run(similarities: RDD[(Long, Long, Double)]): PowerIterationClusteringModel = { val w = normalize(similarities) @@ -197,6 +217,7 @@ class PowerIterationClustering private[clustering] ( /** * A Java-friendly version of [[PowerIterationClustering.run]]. + * @since 1.3.0 */ def run(similarities: JavaRDD[(java.lang.Long, java.lang.Long, java.lang.Double)]) : PowerIterationClusteringModel = { @@ -221,6 +242,9 @@ class PowerIterationClustering private[clustering] ( } } +/** + * @since 1.3.0 + */ @Experimental object PowerIterationClustering extends Logging { @@ -229,6 +253,7 @@ object PowerIterationClustering extends Logging { * Cluster assignment. * @param id node id * @param cluster assigned cluster id + * @since 1.3.0 */ @Experimental case class Assignment(id: Long, cluster: Int) diff --git a/mllib/src/main/scala/org/apache/spark/mllib/clustering/StreamingKMeans.scala b/mllib/src/main/scala/org/apache/spark/mllib/clustering/StreamingKMeans.scala index d9b34cec64894..a915804b02c90 100644 --- a/mllib/src/main/scala/org/apache/spark/mllib/clustering/StreamingKMeans.scala +++ b/mllib/src/main/scala/org/apache/spark/mllib/clustering/StreamingKMeans.scala @@ -63,6 +63,7 @@ import org.apache.spark.util.random.XORShiftRandom * such that at time t + h the discount applied to the data from t is 0.5. * The definition remains the same whether the time unit is given * as batches or points. + * @since 1.2.0 * */ @Experimental @@ -70,7 +71,10 @@ class StreamingKMeansModel( override val clusterCenters: Array[Vector], val clusterWeights: Array[Double]) extends KMeansModel(clusterCenters) with Logging { - /** Perform a k-means update on a batch of data. */ + /** + * Perform a k-means update on a batch of data. 
+ * @since 1.2.0 + */ def update(data: RDD[Vector], decayFactor: Double, timeUnit: String): StreamingKMeansModel = { // find nearest cluster to each point @@ -82,6 +86,7 @@ class StreamingKMeansModel( (p1._1, p1._2 + p2._2) } val dim = clusterCenters(0).size + val pointStats: Array[(Int, (Vector, Long))] = closest .aggregateByKey((Vectors.zeros(dim), 0L))(mergeContribs, mergeContribs) .collect() @@ -161,6 +166,7 @@ class StreamingKMeansModel( * .setRandomCenters(5, 100.0) * .trainOn(DStream) * }}} + * @since 1.2.0 */ @Experimental class StreamingKMeans( @@ -168,23 +174,33 @@ class StreamingKMeans( var decayFactor: Double, var timeUnit: String) extends Logging with Serializable { + /** @since 1.2.0 */ def this() = this(2, 1.0, StreamingKMeans.BATCHES) protected var model: StreamingKMeansModel = new StreamingKMeansModel(null, null) - /** Set the number of clusters. */ + /** + * Set the number of clusters. + * @since 1.2.0 + */ def setK(k: Int): this.type = { this.k = k this } - /** Set the decay factor directly (for forgetful algorithms). */ + /** + * Set the decay factor directly (for forgetful algorithms). + * @since 1.2.0 + */ def setDecayFactor(a: Double): this.type = { this.decayFactor = a this } - /** Set the half life and time unit ("batches" or "points") for forgetful algorithms. */ + /** + * Set the half life and time unit ("batches" or "points") for forgetful algorithms. + * @since 1.2.0 + */ def setHalfLife(halfLife: Double, timeUnit: String): this.type = { if (timeUnit != StreamingKMeans.BATCHES && timeUnit != StreamingKMeans.POINTS) { throw new IllegalArgumentException("Invalid time unit for decay: " + timeUnit) @@ -195,7 +211,10 @@ class StreamingKMeans( this } - /** Specify initial centers directly. */ + /** + * Specify initial centers directly. + * @since 1.2.0 + */ def setInitialCenters(centers: Array[Vector], weights: Array[Double]): this.type = { model = new StreamingKMeansModel(centers, weights) this @@ -207,6 +226,7 @@ class StreamingKMeans( * @param dim Number of dimensions * @param weight Weight for each center * @param seed Random seed + * @since 1.2.0 */ def setRandomCenters(dim: Int, weight: Double, seed: Long = Utils.random.nextLong): this.type = { val random = new XORShiftRandom(seed) @@ -216,7 +236,10 @@ class StreamingKMeans( this } - /** Return the latest model. */ + /** + * Return the latest model. + * @since 1.2.0 + */ def latestModel(): StreamingKMeansModel = { model } @@ -228,6 +251,7 @@ class StreamingKMeans( * and updates the model using each batch of data from the stream. * * @param data DStream containing vector data + * @since 1.2.0 */ def trainOn(data: DStream[Vector]) { assertInitialized() @@ -236,7 +260,10 @@ class StreamingKMeans( } } - /** Java-friendly version of `trainOn`. */ + /** + * Java-friendly version of `trainOn`. + * @since 1.4.0 + */ def trainOn(data: JavaDStream[Vector]): Unit = trainOn(data.dstream) /** @@ -244,13 +271,17 @@ class StreamingKMeans( * * @param data DStream containing vector data * @return DStream containing predictions + * @since 1.2.0 */ def predictOn(data: DStream[Vector]): DStream[Int] = { assertInitialized() data.map(model.predict) } - /** Java-friendly version of `predictOn`. */ + /** + * Java-friendly version of `predictOn`. 
+ * @since 1.4.0 + */ def predictOn(data: JavaDStream[Vector]): JavaDStream[java.lang.Integer] = { JavaDStream.fromDStream(predictOn(data.dstream).asInstanceOf[DStream[java.lang.Integer]]) } @@ -261,13 +292,17 @@ class StreamingKMeans( * @param data DStream containing (key, feature vector) pairs * @tparam K key type * @return DStream containing the input keys and the predictions as values + * @since 1.2.0 */ def predictOnValues[K: ClassTag](data: DStream[(K, Vector)]): DStream[(K, Int)] = { assertInitialized() data.mapValues(model.predict) } - /** Java-friendly version of `predictOnValues`. */ + /** + * Java-friendly version of `predictOnValues`. + * @since 1.4.0 + */ def predictOnValues[K]( data: JavaPairDStream[K, Vector]): JavaPairDStream[K, java.lang.Integer] = { implicit val tag = fakeClassTag[K] From f3391ff2b8b9c1f1308755dc223776692e3b7725 Mon Sep 17 00:00:00 2001 From: Joshi Date: Wed, 19 Aug 2015 21:23:02 +0100 Subject: [PATCH 002/802] [SPARK-8889] [CORE] Fix for OOM for graph creation Fix for OOM for graph creation Author: Joshi Author: Rekha Joshi Closes #7602 from rekhajoshm/SPARK-8889. --- .../spark/ui/scope/RDDOperationGraph.scala | 23 +++++------ .../org/apache/spark/ui/UISeleniumSuite.scala | 39 +++++++++++++++++++ 2 files changed, 51 insertions(+), 11 deletions(-) diff --git a/core/src/main/scala/org/apache/spark/ui/scope/RDDOperationGraph.scala b/core/src/main/scala/org/apache/spark/ui/scope/RDDOperationGraph.scala index ffea9817c0b08..81f168a447ead 100644 --- a/core/src/main/scala/org/apache/spark/ui/scope/RDDOperationGraph.scala +++ b/core/src/main/scala/org/apache/spark/ui/scope/RDDOperationGraph.scala @@ -18,7 +18,7 @@ package org.apache.spark.ui.scope import scala.collection.mutable -import scala.collection.mutable.ListBuffer +import scala.collection.mutable.{StringBuilder, ListBuffer} import org.apache.spark.Logging import org.apache.spark.scheduler.StageInfo @@ -167,7 +167,7 @@ private[ui] object RDDOperationGraph extends Logging { def makeDotFile(graph: RDDOperationGraph): String = { val dotFile = new StringBuilder dotFile.append("digraph G {\n") - dotFile.append(makeDotSubgraph(graph.rootCluster, indent = " ")) + makeDotSubgraph(dotFile, graph.rootCluster, indent = " ") graph.edges.foreach { edge => dotFile.append(s""" ${edge.fromId}->${edge.toId};\n""") } dotFile.append("}") val result = dotFile.toString() @@ -180,18 +180,19 @@ private[ui] object RDDOperationGraph extends Logging { s"""${node.id} [label="${node.name} [${node.id}]"]""" } - /** Return the dot representation of a subgraph in an RDDOperationGraph. */ - private def makeDotSubgraph(cluster: RDDOperationCluster, indent: String): String = { - val subgraph = new StringBuilder - subgraph.append(indent + s"subgraph cluster${cluster.id} {\n") - subgraph.append(indent + s""" label="${cluster.name}";\n""") + /** Update the dot representation of the RDDOperationGraph in cluster to subgraph. 
*/ + private def makeDotSubgraph( + subgraph: StringBuilder, + cluster: RDDOperationCluster, + indent: String): Unit = { + subgraph.append(indent).append(s"subgraph cluster${cluster.id} {\n") + subgraph.append(indent).append(s""" label="${cluster.name}";\n""") cluster.childNodes.foreach { node => - subgraph.append(indent + s" ${makeDotNode(node)};\n") + subgraph.append(indent).append(s" ${makeDotNode(node)};\n") } cluster.childClusters.foreach { cscope => - subgraph.append(makeDotSubgraph(cscope, indent + " ")) + makeDotSubgraph(subgraph, cscope, indent + " ") } - subgraph.append(indent + "}\n") - subgraph.toString() + subgraph.append(indent).append("}\n") } } diff --git a/core/src/test/scala/org/apache/spark/ui/UISeleniumSuite.scala b/core/src/test/scala/org/apache/spark/ui/UISeleniumSuite.scala index 3aa672f8b713c..69888b2694bae 100644 --- a/core/src/test/scala/org/apache/spark/ui/UISeleniumSuite.scala +++ b/core/src/test/scala/org/apache/spark/ui/UISeleniumSuite.scala @@ -20,6 +20,7 @@ package org.apache.spark.ui import java.net.{HttpURLConnection, URL} import javax.servlet.http.{HttpServletResponse, HttpServletRequest} +import scala.io.Source import scala.collection.JavaConversions._ import scala.xml.Node @@ -603,6 +604,44 @@ class UISeleniumSuite extends SparkFunSuite with WebBrowser with Matchers with B } } + test("job stages should have expected dotfile under DAG visualization") { + withSpark(newSparkContext()) { sc => + // Create a multi-stage job + val rdd = + sc.parallelize(Seq(1, 2, 3)).map(identity).groupBy(identity).map(identity).groupBy(identity) + rdd.count() + + val stage0 = Source.fromURL(sc.ui.get.appUIAddress + + "/stages/stage/?id=0&attempt=0&expandDagViz=true").mkString + assert(stage0.contains("digraph G {\n subgraph clusterstage_0 {\n " + + "label="Stage 0";\n subgraph ")) + assert(stage0.contains("{\n label="parallelize";\n " + + "0 [label="ParallelCollectionRDD [0]"];\n }")) + assert(stage0.contains("{\n label="map";\n " + + "1 [label="MapPartitionsRDD [1]"];\n }")) + assert(stage0.contains("{\n label="groupBy";\n " + + "2 [label="MapPartitionsRDD [2]"];\n }")) + + val stage1 = Source.fromURL(sc.ui.get.appUIAddress + + "/stages/stage/?id=1&attempt=0&expandDagViz=true").mkString + assert(stage1.contains("digraph G {\n subgraph clusterstage_1 {\n " + + "label="Stage 1";\n subgraph ")) + assert(stage1.contains("{\n label="groupBy";\n " + + "3 [label="ShuffledRDD [3]"];\n }")) + assert(stage1.contains("{\n label="map";\n " + + "4 [label="MapPartitionsRDD [4]"];\n }")) + assert(stage1.contains("{\n label="groupBy";\n " + + "5 [label="MapPartitionsRDD [5]"];\n }")) + + val stage2 = Source.fromURL(sc.ui.get.appUIAddress + + "/stages/stage/?id=2&attempt=0&expandDagViz=true").mkString + assert(stage2.contains("digraph G {\n subgraph clusterstage_2 {\n " + + "label="Stage 2";\n subgraph ")) + assert(stage2.contains("{\n label="groupBy";\n " + + "6 [label="ShuffledRDD [6]"];\n }")) + } + } + def goToUi(sc: SparkContext, path: String): Unit = { goToUi(sc.ui.get, path) } From e05da5cb5ea253e6372f648fc8203204f2a8df8d Mon Sep 17 00:00:00 2001 From: Davies Liu Date: Wed, 19 Aug 2015 13:43:04 -0700 Subject: [PATCH 003/802] [SPARK-10107] [SQL] fix NPE in format_number Author: Davies Liu Closes #8305 from davies/format_number. 
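Before the diff for this format_number fix, a quick hedged illustration of the scenario it targets, written spark-shell style (the `sqlContext` binding and its implicits are assumed from a 1.5-era shell session, not part of the patch): when consecutive rows pass the same decimal-places argument, the generated code reuses the cached DecimalFormat, and prior to this change the result assignment only happened on the branch that rebuilt the pattern.

```scala
// Hedged sketch mirroring the StringFunctionsSuite change in this commit.
// Assumes a Spark 1.5-era shell where `sqlContext` is predefined.
import sqlContext.implicits._

val df = Seq((5L, 4), (4L, 3), (4L, 3), (3L, 2)).toDF("a", "b")

// The repeated (4L, 3) rows keep d == lastDValue, the path that previously
// skipped assigning the formatted result in the generated code.
df.filter("b > 0").selectExpr("format_number(a, b)").show()
// Expected with the fix: 5.0000, 4.000, 4.000, 3.00
```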
--- .../spark/sql/catalyst/expressions/stringOperations.scala | 2 +- .../scala/org/apache/spark/sql/StringFunctionsSuite.scala | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/stringOperations.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/stringOperations.scala index 134f1aa2af9a8..ca044d3e95e38 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/stringOperations.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/stringOperations.scala @@ -1306,8 +1306,8 @@ case class FormatNumber(x: Expression, d: Expression) $df $dFormat = new $df($pattern.toString()); $lastDValue = $d; $numberFormat.applyPattern($dFormat.toPattern()); - ${ev.primitive} = UTF8String.fromString($numberFormat.format(${typeHelper(num)})); } + ${ev.primitive} = UTF8String.fromString($numberFormat.format(${typeHelper(num)})); } else { ${ev.primitive} = null; ${ev.isNull} = true; diff --git a/sql/core/src/test/scala/org/apache/spark/sql/StringFunctionsSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/StringFunctionsSuite.scala index cc95eede005d7..b91438baea06f 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/StringFunctionsSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/StringFunctionsSuite.scala @@ -348,9 +348,9 @@ class StringFunctionsSuite extends QueryTest with SharedSQLContext { // it will still use the interpretProjection if projection follows by a LocalRelation, // hence we add a filter operator. // See the optimizer rule `ConvertToLocalRelation` - val df2 = Seq((5L, 4), (4L, 3), (3L, 2)).toDF("a", "b") + val df2 = Seq((5L, 4), (4L, 3), (4L, 3), (4L, 3), (3L, 2)).toDF("a", "b") checkAnswer( df2.filter("b>0").selectExpr("format_number(a, b)"), - Row("5.0000") :: Row("4.000") :: Row("3.00") :: Nil) + Row("5.0000") :: Row("4.000") :: Row("4.000") :: Row("4.000") :: Row("3.00") :: Nil) } } From 08887369c890e0dd87eb8b34e8c32bb03307bf24 Mon Sep 17 00:00:00 2001 From: Davies Liu Date: Wed, 19 Aug 2015 13:56:40 -0700 Subject: [PATCH 004/802] [SPARK-10073] [SQL] Python withColumn should replace the old column DataFrame.withColumn in Python should be consistent with the Scala one (replacing the existing column that has the same name). cc marmbrus Author: Davies Liu Closes #8300 from davies/with_column. --- python/pyspark/sql/dataframe.py | 12 ++++++------ python/pyspark/sql/tests.py | 4 ++++ .../main/scala/org/apache/spark/sql/DataFrame.scala | 3 ++- 3 files changed, 12 insertions(+), 7 deletions(-) diff --git a/python/pyspark/sql/dataframe.py b/python/pyspark/sql/dataframe.py index da742d7ce7d13..025811f519293 100644 --- a/python/pyspark/sql/dataframe.py +++ b/python/pyspark/sql/dataframe.py @@ -1202,7 +1202,9 @@ def freqItems(self, cols, support=None): @ignore_unicode_prefix @since(1.3) def withColumn(self, colName, col): - """Returns a new :class:`DataFrame` by adding a column. + """ + Returns a new :class:`DataFrame` by adding a column or replacing the + existing column that has the same name. :param colName: string, name of the new column. :param col: a :class:`Column` expression for the new column. 
@@ -1210,7 +1212,8 @@ def withColumn(self, colName, col): >>> df.withColumn('age2', df.age + 2).collect() [Row(age=2, name=u'Alice', age2=4), Row(age=5, name=u'Bob', age2=7)] """ - return self.select('*', col.alias(colName)) + assert isinstance(col, Column), "col should be Column" + return DataFrame(self._jdf.withColumn(colName, col._jc), self.sql_ctx) @ignore_unicode_prefix @since(1.3) @@ -1223,10 +1226,7 @@ def withColumnRenamed(self, existing, new): >>> df.withColumnRenamed('age', 'age2').collect() [Row(age2=2, name=u'Alice'), Row(age2=5, name=u'Bob')] """ - cols = [Column(_to_java_column(c)).alias(new) - if c == existing else c - for c in self.columns] - return self.select(*cols) + return DataFrame(self._jdf.withColumnRenamed(existing, new), self.sql_ctx) @since(1.4) @ignore_unicode_prefix diff --git a/python/pyspark/sql/tests.py b/python/pyspark/sql/tests.py index 13cf647b66da8..aacfb34c77618 100644 --- a/python/pyspark/sql/tests.py +++ b/python/pyspark/sql/tests.py @@ -1035,6 +1035,10 @@ def test_capture_illegalargument_exception(self): self.assertRaisesRegexp(IllegalArgumentException, "1024 is not in the permitted values", lambda: df.select(sha2(df.a, 1024)).collect()) + def test_with_column_with_existing_name(self): + keys = self.df.withColumn("key", self.df.key).select("key").collect() + self.assertEqual([r.key for r in keys], list(range(100))) + class HiveContextSQLTests(ReusedPySparkTestCase): diff --git a/sql/core/src/main/scala/org/apache/spark/sql/DataFrame.scala b/sql/core/src/main/scala/org/apache/spark/sql/DataFrame.scala index fd0ead4401193..d6688b24ae7d6 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/DataFrame.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/DataFrame.scala @@ -1133,7 +1133,8 @@ class DataFrame private[sql]( ///////////////////////////////////////////////////////////////////////////// /** - * Returns a new [[DataFrame]] by adding a column. + * Returns a new [[DataFrame]] by adding a column or replacing the existing column that has + * the same name. * @group dfops * @since 1.3.0 */ From 21bdbe9fe69be47be562de24216a469e5ee64c7b Mon Sep 17 00:00:00 2001 From: Cheng Lian Date: Wed, 19 Aug 2015 13:57:52 -0700 Subject: [PATCH 005/802] [SPARK-9627] [SQL] Stops using Scala runtime reflection in DictionaryEncoding `DictionaryEncoding` uses Scala runtime reflection to avoid boxing costs while building the directory array. However, this code path may hit [SI-6240] [1] and throw exception. [1]: https://issues.scala-lang.org/browse/SI-6240 Author: Cheng Lian Closes #8306 from liancheng/spark-9627/in-memory-cache-scala-reflection. 
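Before the diff, a self-contained sketch (plain Scala, not the Spark classes themselves) of the pattern this change adopts: keep the decoded dictionary as `Array[Any]` so that no `ClassTag`, and therefore no runtime reflection, is required, and cast each element back to its concrete type at the single place it is read.

```scala
import scala.reflect.ClassTag

// Old shape: filling a typed array needs a ClassTag[T], which the removed code
// obtained through scala.reflect.runtime (the path that can hit SI-6240).
def buildTyped[T: ClassTag](n: Int)(read: () => T): Array[T] =
  Array.fill(n)(read())

// New shape: reflection-free; the cost is one cast per element access.
def buildUntyped[T](n: Int)(read: () => T): Array[Any] =
  Array.fill[Any](n)(read().asInstanceOf[Any])

val dictionary: Array[Any] = buildUntyped(3)(() => 42)
val value: Int = dictionary(0).asInstanceOf[Int]  // per-read cast, as in the new Decoder
```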
--- .../sql/columnar/InMemoryColumnarTableScan.scala | 1 - .../columnar/compression/compressionSchemes.scala | 15 ++++----------- 2 files changed, 4 insertions(+), 12 deletions(-) diff --git a/sql/core/src/main/scala/org/apache/spark/sql/columnar/InMemoryColumnarTableScan.scala b/sql/core/src/main/scala/org/apache/spark/sql/columnar/InMemoryColumnarTableScan.scala index 45f15fd04d4e2..66d429bc06198 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/columnar/InMemoryColumnarTableScan.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/columnar/InMemoryColumnarTableScan.scala @@ -120,7 +120,6 @@ private[sql] case class InMemoryRelation( new Iterator[CachedBatch] { def next(): CachedBatch = { val columnBuilders = output.map { attribute => - val columnType = ColumnType(attribute.dataType) ColumnBuilder(attribute.dataType, batchSize, attribute.name, useCompression) }.toArray diff --git a/sql/core/src/main/scala/org/apache/spark/sql/columnar/compression/compressionSchemes.scala b/sql/core/src/main/scala/org/apache/spark/sql/columnar/compression/compressionSchemes.scala index c91d960a0932b..ca910a99db082 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/columnar/compression/compressionSchemes.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/columnar/compression/compressionSchemes.scala @@ -270,20 +270,13 @@ private[sql] case object DictionaryEncoding extends CompressionScheme { class Decoder[T <: AtomicType](buffer: ByteBuffer, columnType: NativeColumnType[T]) extends compression.Decoder[T] { - private val dictionary = { - // TODO Can we clean up this mess? Maybe move this to `DataType`? - implicit val classTag = { - val mirror = runtimeMirror(Utils.getSparkClassLoader) - ClassTag[T#InternalType](mirror.runtimeClass(columnType.scalaTag.tpe)) - } - - Array.fill(buffer.getInt()) { - columnType.extract(buffer) - } + private val dictionary: Array[Any] = { + val elementNum = buffer.getInt() + Array.fill[Any](elementNum)(columnType.extract(buffer).asInstanceOf[Any]) } override def next(row: MutableRow, ordinal: Int): Unit = { - columnType.setField(row, ordinal, dictionary(buffer.getShort())) + columnType.setField(row, ordinal, dictionary(buffer.getShort()).asInstanceOf[T#InternalType]) } override def hasNext: Boolean = buffer.hasRemaining From 1f4c4fe6dfd8cc52b5fddfd67a31a77edbb1a036 Mon Sep 17 00:00:00 2001 From: Davies Liu Date: Wed, 19 Aug 2015 14:03:47 -0700 Subject: [PATCH 006/802] [SPARK-10090] [SQL] fix decimal scale of division We should rounding the result of multiply/division of decimal to expected precision/scale, also check overflow. Author: Davies Liu Closes #8287 from davies/decimal_division. 
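Before the diff, a worked example of the result type the updated DecimalPrecision rule derives for division; the formulas come from the `Divide` case added below, while the concrete input types are only illustrative.

```scala
// For Divide(Decimal(p1, s1), Decimal(p2, s2)) the rule below uses:
//   scale     = max(6, s1 + p2 + 1)
//   precision = p1 - s1 + s2 + scale   (then capped by DecimalType.bounded)
val (p1, s1) = (10, 2)                    // illustrative dividend type Decimal(10, 2)
val (p2, s2) = (5, 0)                     // illustrative divisor type Decimal(5, 0)
val scale     = math.max(6, s1 + p2 + 1)  // 8
val precision = p1 - s1 + s2 + scale      // 16, so the result type is Decimal(16, 8)
// Both operands are first wrapped in PromotePrecision casts to their wider
// common type, and the division itself in CheckOverflow, which (per the commit
// message) rounds to the target scale and checks for overflow.
```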
--- .../catalyst/analysis/HiveTypeCoercion.scala | 28 +++++---- .../spark/sql/catalyst/expressions/Cast.scala | 32 +++++----- .../expressions/decimalFunctions.scala | 38 ++++++++++- .../org/apache/spark/sql/types/Decimal.scala | 4 +- .../expressions/DecimalExpressionSuite.scala | 63 +++++++++++++++++++ .../org/apache/spark/sql/SQLQuerySuite.scala | 23 ++++++- 6 files changed, 157 insertions(+), 31 deletions(-) create mode 100644 sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/DecimalExpressionSuite.scala diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/HiveTypeCoercion.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/HiveTypeCoercion.scala index 8581d6b496c15..62c27ee0b9ee0 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/HiveTypeCoercion.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/HiveTypeCoercion.scala @@ -371,8 +371,8 @@ object HiveTypeCoercion { DecimalType.bounded(range + scale, scale) } - private def changePrecision(e: Expression, dataType: DataType): Expression = { - ChangeDecimalPrecision(Cast(e, dataType)) + private def promotePrecision(e: Expression, dataType: DataType): Expression = { + PromotePrecision(Cast(e, dataType)) } def apply(plan: LogicalPlan): LogicalPlan = plan resolveOperators { @@ -383,36 +383,42 @@ object HiveTypeCoercion { case e if !e.childrenResolved => e // Skip nodes who is already promoted - case e: BinaryArithmetic if e.left.isInstanceOf[ChangeDecimalPrecision] => e + case e: BinaryArithmetic if e.left.isInstanceOf[PromotePrecision] => e case Add(e1 @ DecimalType.Expression(p1, s1), e2 @ DecimalType.Expression(p2, s2)) => val dt = DecimalType.bounded(max(s1, s2) + max(p1 - s1, p2 - s2) + 1, max(s1, s2)) - Add(changePrecision(e1, dt), changePrecision(e2, dt)) + CheckOverflow(Add(promotePrecision(e1, dt), promotePrecision(e2, dt)), dt) case Subtract(e1 @ DecimalType.Expression(p1, s1), e2 @ DecimalType.Expression(p2, s2)) => val dt = DecimalType.bounded(max(s1, s2) + max(p1 - s1, p2 - s2) + 1, max(s1, s2)) - Subtract(changePrecision(e1, dt), changePrecision(e2, dt)) + CheckOverflow(Subtract(promotePrecision(e1, dt), promotePrecision(e2, dt)), dt) case Multiply(e1 @ DecimalType.Expression(p1, s1), e2 @ DecimalType.Expression(p2, s2)) => - val dt = DecimalType.bounded(p1 + p2 + 1, s1 + s2) - Multiply(changePrecision(e1, dt), changePrecision(e2, dt)) + val resultType = DecimalType.bounded(p1 + p2 + 1, s1 + s2) + val widerType = widerDecimalType(p1, s1, p2, s2) + CheckOverflow(Multiply(promotePrecision(e1, widerType), promotePrecision(e2, widerType)), + resultType) case Divide(e1 @ DecimalType.Expression(p1, s1), e2 @ DecimalType.Expression(p2, s2)) => - val dt = DecimalType.bounded(p1 - s1 + s2 + max(6, s1 + p2 + 1), max(6, s1 + p2 + 1)) - Divide(changePrecision(e1, dt), changePrecision(e2, dt)) + val resultType = DecimalType.bounded(p1 - s1 + s2 + max(6, s1 + p2 + 1), + max(6, s1 + p2 + 1)) + val widerType = widerDecimalType(p1, s1, p2, s2) + CheckOverflow(Divide(promotePrecision(e1, widerType), promotePrecision(e2, widerType)), + resultType) case Remainder(e1 @ DecimalType.Expression(p1, s1), e2 @ DecimalType.Expression(p2, s2)) => val resultType = DecimalType.bounded(min(p1 - s1, p2 - s2) + max(s1, s2), max(s1, s2)) // resultType may have lower precision, so we cast them into wider type first. 
val widerType = widerDecimalType(p1, s1, p2, s2) - Cast(Remainder(changePrecision(e1, widerType), changePrecision(e2, widerType)), + CheckOverflow(Remainder(promotePrecision(e1, widerType), promotePrecision(e2, widerType)), resultType) case Pmod(e1 @ DecimalType.Expression(p1, s1), e2 @ DecimalType.Expression(p2, s2)) => val resultType = DecimalType.bounded(min(p1 - s1, p2 - s2) + max(s1, s2), max(s1, s2)) // resultType may have lower precision, so we cast them into wider type first. val widerType = widerDecimalType(p1, s1, p2, s2) - Cast(Pmod(changePrecision(e1, widerType), changePrecision(e2, widerType)), resultType) + CheckOverflow(Pmod(promotePrecision(e1, widerType), promotePrecision(e2, widerType)), + resultType) case b @ BinaryComparison(e1 @ DecimalType.Expression(p1, s1), e2 @ DecimalType.Expression(p2, s2)) if p1 != p2 || s1 != s2 => diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/Cast.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/Cast.scala index 616b9e0e65b78..2db954257be35 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/Cast.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/Cast.scala @@ -447,7 +447,7 @@ case class Cast(child: Expression, dataType: DataType) case StringType => castToStringCode(from, ctx) case BinaryType => castToBinaryCode(from) case DateType => castToDateCode(from, ctx) - case decimal: DecimalType => castToDecimalCode(from, decimal) + case decimal: DecimalType => castToDecimalCode(from, decimal, ctx) case TimestampType => castToTimestampCode(from, ctx) case CalendarIntervalType => castToIntervalCode(from) case BooleanType => castToBooleanCode(from) @@ -528,14 +528,18 @@ case class Cast(child: Expression, dataType: DataType) } """ - private[this] def castToDecimalCode(from: DataType, target: DecimalType): CastFunction = { + private[this] def castToDecimalCode( + from: DataType, + target: DecimalType, + ctx: CodeGenContext): CastFunction = { + val tmp = ctx.freshName("tmpDecimal") from match { case StringType => (c, evPrim, evNull) => s""" try { - Decimal tmpDecimal = Decimal.apply(new java.math.BigDecimal($c.toString())); - ${changePrecision("tmpDecimal", target, evPrim, evNull)} + Decimal $tmp = Decimal.apply(new java.math.BigDecimal($c.toString())); + ${changePrecision(tmp, target, evPrim, evNull)} } catch (java.lang.NumberFormatException e) { $evNull = true; } @@ -543,8 +547,8 @@ case class Cast(child: Expression, dataType: DataType) case BooleanType => (c, evPrim, evNull) => s""" - Decimal tmpDecimal = $c ? Decimal.apply(1) : Decimal.apply(0); - ${changePrecision("tmpDecimal", target, evPrim, evNull)} + Decimal $tmp = $c ? Decimal.apply(1) : Decimal.apply(0); + ${changePrecision(tmp, target, evPrim, evNull)} """ case DateType => // date can't cast to decimal in Hive @@ -553,29 +557,29 @@ case class Cast(child: Expression, dataType: DataType) // Note that we lose precision here. 
(c, evPrim, evNull) => s""" - Decimal tmpDecimal = Decimal.apply( + Decimal $tmp = Decimal.apply( scala.math.BigDecimal.valueOf(${timestampToDoubleCode(c)})); - ${changePrecision("tmpDecimal", target, evPrim, evNull)} + ${changePrecision(tmp, target, evPrim, evNull)} """ case DecimalType() => (c, evPrim, evNull) => s""" - Decimal tmpDecimal = $c.clone(); - ${changePrecision("tmpDecimal", target, evPrim, evNull)} + Decimal $tmp = $c.clone(); + ${changePrecision(tmp, target, evPrim, evNull)} """ case x: IntegralType => (c, evPrim, evNull) => s""" - Decimal tmpDecimal = Decimal.apply((long) $c); - ${changePrecision("tmpDecimal", target, evPrim, evNull)} + Decimal $tmp = Decimal.apply((long) $c); + ${changePrecision(tmp, target, evPrim, evNull)} """ case x: FractionalType => // All other numeric types can be represented precisely as Doubles (c, evPrim, evNull) => s""" try { - Decimal tmpDecimal = Decimal.apply(scala.math.BigDecimal.valueOf((double) $c)); - ${changePrecision("tmpDecimal", target, evPrim, evNull)} + Decimal $tmp = Decimal.apply(scala.math.BigDecimal.valueOf((double) $c)); + ${changePrecision(tmp, target, evPrim, evNull)} } catch (java.lang.NumberFormatException e) { $evNull = true; } diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/decimalFunctions.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/decimalFunctions.scala index adb33e4c8d4a1..b7be12f7aa741 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/decimalFunctions.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/decimalFunctions.scala @@ -66,10 +66,44 @@ case class MakeDecimal(child: Expression, precision: Int, scale: Int) extends Un * An expression used to wrap the children when promote the precision of DecimalType to avoid * promote multiple times. */ -case class ChangeDecimalPrecision(child: Expression) extends UnaryExpression { +case class PromotePrecision(child: Expression) extends UnaryExpression { override def dataType: DataType = child.dataType override def eval(input: InternalRow): Any = child.eval(input) override def gen(ctx: CodeGenContext): GeneratedExpressionCode = child.gen(ctx) override protected def genCode(ctx: CodeGenContext, ev: GeneratedExpressionCode): String = "" - override def prettyName: String = "change_decimal_precision" + override def prettyName: String = "promote_precision" +} + +/** + * Rounds the decimal to given scale and check whether the decimal can fit in provided precision + * or not, returns null if not. 
+ */ +case class CheckOverflow(child: Expression, dataType: DecimalType) extends UnaryExpression { + + override def nullable: Boolean = true + + override def nullSafeEval(input: Any): Any = { + val d = input.asInstanceOf[Decimal].clone() + if (d.changePrecision(dataType.precision, dataType.scale)) { + d + } else { + null + } + } + + override protected def genCode(ctx: CodeGenContext, ev: GeneratedExpressionCode): String = { + nullSafeCodeGen(ctx, ev, eval => { + val tmp = ctx.freshName("tmp") + s""" + | Decimal $tmp = $eval.clone(); + | if ($tmp.changePrecision(${dataType.precision}, ${dataType.scale})) { + | ${ev.primitive} = $tmp; + | } else { + | ${ev.isNull} = true; + | } + """.stripMargin + }) + } + + override def toString: String = s"CheckOverflow($child, $dataType)" } diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/types/Decimal.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/types/Decimal.scala index d95805c24521c..c988f1d1b972e 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/types/Decimal.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/types/Decimal.scala @@ -267,7 +267,7 @@ final class Decimal extends Ordered[Decimal] with Serializable { if (decimalVal.eq(null) && that.decimalVal.eq(null) && scale == that.scale) { Decimal(longVal + that.longVal, Math.max(precision, that.precision), scale) } else { - Decimal(toBigDecimal + that.toBigDecimal, precision, scale) + Decimal(toBigDecimal + that.toBigDecimal) } } @@ -275,7 +275,7 @@ final class Decimal extends Ordered[Decimal] with Serializable { if (decimalVal.eq(null) && that.decimalVal.eq(null) && scale == that.scale) { Decimal(longVal - that.longVal, Math.max(precision, that.precision), scale) } else { - Decimal(toBigDecimal - that.toBigDecimal, precision, scale) + Decimal(toBigDecimal - that.toBigDecimal) } } diff --git a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/DecimalExpressionSuite.scala b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/DecimalExpressionSuite.scala new file mode 100644 index 0000000000000..511f0307901df --- /dev/null +++ b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/DecimalExpressionSuite.scala @@ -0,0 +1,63 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.spark.sql.catalyst.expressions + +import org.apache.spark.SparkFunSuite +import org.apache.spark.sql.types.{LongType, DecimalType, Decimal} + + +class DecimalExpressionSuite extends SparkFunSuite with ExpressionEvalHelper { + + test("UnscaledValue") { + val d1 = Decimal("10.1") + checkEvaluation(UnscaledValue(Literal(d1)), 101L) + val d2 = Decimal(101, 3, 1) + checkEvaluation(UnscaledValue(Literal(d2)), 101L) + checkEvaluation(UnscaledValue(Literal.create(null, DecimalType(2, 1))), null) + } + + test("MakeDecimal") { + checkEvaluation(MakeDecimal(Literal(101L), 3, 1), Decimal("10.1")) + checkEvaluation(MakeDecimal(Literal.create(null, LongType), 3, 1), null) + } + + test("PromotePrecision") { + val d1 = Decimal("10.1") + checkEvaluation(PromotePrecision(Literal(d1)), d1) + val d2 = Decimal(101, 3, 1) + checkEvaluation(PromotePrecision(Literal(d2)), d2) + checkEvaluation(PromotePrecision(Literal.create(null, DecimalType(2, 1))), null) + } + + test("CheckOverflow") { + val d1 = Decimal("10.1") + checkEvaluation(CheckOverflow(Literal(d1), DecimalType(4, 0)), Decimal("10")) + checkEvaluation(CheckOverflow(Literal(d1), DecimalType(4, 1)), d1) + checkEvaluation(CheckOverflow(Literal(d1), DecimalType(4, 2)), d1) + checkEvaluation(CheckOverflow(Literal(d1), DecimalType(4, 3)), null) + + val d2 = Decimal(101, 3, 1) + checkEvaluation(CheckOverflow(Literal(d2), DecimalType(4, 0)), Decimal("10")) + checkEvaluation(CheckOverflow(Literal(d2), DecimalType(4, 1)), d2) + checkEvaluation(CheckOverflow(Literal(d2), DecimalType(4, 2)), d2) + checkEvaluation(CheckOverflow(Literal(d2), DecimalType(4, 3)), null) + + checkEvaluation(CheckOverflow(Literal.create(null, DecimalType(2, 1)), DecimalType(3, 2)), null) + } + +} diff --git a/sql/core/src/test/scala/org/apache/spark/sql/SQLQuerySuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/SQLQuerySuite.scala index c329fdb2a6bb1..141468ca00d67 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/SQLQuerySuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/SQLQuerySuite.scala @@ -17,16 +17,17 @@ package org.apache.spark.sql +import java.math.MathContext import java.sql.Timestamp import org.apache.spark.AccumulatorSuite -import org.apache.spark.sql.catalyst.analysis.FunctionRegistry import org.apache.spark.sql.catalyst.DefaultParserDialect +import org.apache.spark.sql.catalyst.analysis.FunctionRegistry import org.apache.spark.sql.catalyst.errors.DialectException import org.apache.spark.sql.execution.aggregate import org.apache.spark.sql.functions._ -import org.apache.spark.sql.test.SharedSQLContext import org.apache.spark.sql.test.SQLTestData._ +import org.apache.spark.sql.test.SharedSQLContext import org.apache.spark.sql.types._ /** A SQL Dialect for testing purpose, and it can not be nested type */ @@ -1608,6 +1609,24 @@ class SQLQuerySuite extends QueryTest with SharedSQLContext { } } + test("decimal precision with multiply/division") { + checkAnswer(sql("select 10.3 * 3.0"), Row(BigDecimal("30.90"))) + checkAnswer(sql("select 10.3000 * 3.0"), Row(BigDecimal("30.90000"))) + checkAnswer(sql("select 10.30000 * 30.0"), Row(BigDecimal("309.000000"))) + checkAnswer(sql("select 10.300000000000000000 * 3.000000000000000000"), + Row(BigDecimal("30.900000000000000000000000000000000000", new MathContext(38)))) + checkAnswer(sql("select 10.300000000000000000 * 3.0000000000000000000"), + Row(null)) + + checkAnswer(sql("select 10.3 / 3.0"), Row(BigDecimal("3.433333"))) + checkAnswer(sql("select 10.3000 / 3.0"), 
Row(BigDecimal("3.4333333"))) + checkAnswer(sql("select 10.30000 / 30.0"), Row(BigDecimal("0.343333333"))) + checkAnswer(sql("select 10.300000000000000000 / 3.00000000000000000"), + Row(BigDecimal("3.4333333333333333333333333333333333333", new MathContext(38)))) + checkAnswer(sql("select 10.3000000000000000000 / 3.00000000000000000"), + Row(null)) + } + test("external sorting updates peak execution memory") { withSQLConf((SQLConf.EXTERNAL_SORT.key, "true")) { val sc = sqlContext.sparkContext From f3ff4c41d2e32bd0f2419d1c9c68fcd0c2593e41 Mon Sep 17 00:00:00 2001 From: Cheng Lian Date: Wed, 19 Aug 2015 14:15:28 -0700 Subject: [PATCH 007/802] [SPARK-9899] [SQL] Disables customized output committer when speculation is on Speculation hates direct output committer, as there are multiple corner cases that may cause data corruption and/or data loss. Please see this [PR comment] [1] for more details. [1]: https://github.com/apache/spark/pull/8191#issuecomment-131598385 Author: Cheng Lian Closes #8317 from liancheng/spark-9899/speculation-hates-direct-output-committer. --- .../datasources/WriterContainer.scala | 16 ++++++++- .../sql/sources/hadoopFsRelationSuites.scala | 34 +++++++++++++++++++ 2 files changed, 49 insertions(+), 1 deletion(-) diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/WriterContainer.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/WriterContainer.scala index e0147079e6997..78f48a5cd72c7 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/WriterContainer.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/WriterContainer.scala @@ -58,6 +58,9 @@ private[sql] abstract class BaseWriterContainer( // This is only used on driver side. @transient private val jobContext: JobContext = job + private val speculationEnabled: Boolean = + relation.sqlContext.sparkContext.conf.getBoolean("spark.speculation", defaultValue = false) + // The following fields are initialized and used on both driver and executor side. @transient protected var outputCommitter: OutputCommitter = _ @transient private var jobId: JobID = _ @@ -126,10 +129,21 @@ private[sql] abstract class BaseWriterContainer( // associated with the file output format since it is not safe to use a custom // committer for appending. For example, in S3, direct parquet output committer may // leave partial data in the destination dir when the the appending job fails. + // + // See SPARK-8578 for more details logInfo( - s"Using output committer class ${defaultOutputCommitter.getClass.getCanonicalName} " + + s"Using default output committer ${defaultOutputCommitter.getClass.getCanonicalName} " + "for appending.") defaultOutputCommitter + } else if (speculationEnabled) { + // When speculation is enabled, it's not safe to use customized output committer classes, + // especially direct output committers (e.g. `DirectParquetOutputCommitter`). + // + // See SPARK-9899 for more details. 
+ logInfo( + s"Using default output committer ${defaultOutputCommitter.getClass.getCanonicalName} " + + "because spark.speculation is configured to be true.") + defaultOutputCommitter } else { val committerClass = context.getConfiguration.getClass( SQLConf.OUTPUT_COMMITTER_CLASS.key, null, classOf[OutputCommitter]) diff --git a/sql/hive/src/test/scala/org/apache/spark/sql/sources/hadoopFsRelationSuites.scala b/sql/hive/src/test/scala/org/apache/spark/sql/sources/hadoopFsRelationSuites.scala index 8d0d9218ddd6a..5bbca14bad320 100644 --- a/sql/hive/src/test/scala/org/apache/spark/sql/sources/hadoopFsRelationSuites.scala +++ b/sql/hive/src/test/scala/org/apache/spark/sql/sources/hadoopFsRelationSuites.scala @@ -570,6 +570,40 @@ abstract class HadoopFsRelationTest extends QueryTest with SQLTestUtils { df.write.format(dataSourceName).partitionBy("c", "d", "e").saveAsTable("t") } } + + test("SPARK-9899 Disable customized output committer when speculation is on") { + val clonedConf = new Configuration(configuration) + val speculationEnabled = + sqlContext.sparkContext.conf.getBoolean("spark.speculation", defaultValue = false) + + try { + withTempPath { dir => + // Enables task speculation + sqlContext.sparkContext.conf.set("spark.speculation", "true") + + // Uses a customized output committer which always fails + configuration.set( + SQLConf.OUTPUT_COMMITTER_CLASS.key, + classOf[AlwaysFailOutputCommitter].getName) + + // Code below shouldn't throw since customized output committer should be disabled. + val df = sqlContext.range(10).coalesce(1) + df.write.format(dataSourceName).save(dir.getCanonicalPath) + checkAnswer( + sqlContext + .read + .format(dataSourceName) + .option("dataSchema", df.schema.json) + .load(dir.getCanonicalPath), + df) + } + } finally { + // Hadoop 1 doesn't have `Configuration.unset` + configuration.clear() + clonedConf.foreach(entry => configuration.set(entry.getKey, entry.getValue)) + sqlContext.sparkContext.conf.set("spark.speculation", speculationEnabled.toString) + } + } } // This class is used to test SPARK-8578. We should not use any custom output committer when From 373a376c04320aab228b5c385e2b788809877d3e Mon Sep 17 00:00:00 2001 From: Daoyuan Wang Date: Wed, 19 Aug 2015 14:31:51 -0700 Subject: [PATCH 008/802] [SPARK-10083] [SQL] CaseWhen should support type coercion of DecimalType and FractionalType create t1 (a decimal(7, 2), b long); select case when 1=1 then a else 1.0 end from t1; select case when 1=1 then a else b end from t1; Author: Daoyuan Wang Closes #8270 from adrian-wang/casewhenfractional. 
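A usage sketch of the coercion this enables, reusing the table from the commit message; it assumes a `sqlContext` with a registered table `t1(a decimal(7, 2), b bigint)` and is illustrative rather than part of the patch:

    // decimal(7, 2) vs a double branch: the decimal side widens to double.
    val asDouble = sqlContext.sql(
      "select case when 1 = 1 then a else 1.0 end from t1")

    // decimal(7, 2) vs bigint (decimal(20, 0)): both branches are cast to the
    // wider decimal(22, 2) instead of leaving the CASE expression unresolved.
    val asDecimal = sqlContext.sql(
      "select case when 1 = 1 then a else b end from t1")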
--- .../sql/catalyst/analysis/HiveTypeCoercion.scala | 4 ++-- .../sql/catalyst/analysis/HiveTypeCoercionSuite.scala | 11 +++++++++++ 2 files changed, 13 insertions(+), 2 deletions(-) diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/HiveTypeCoercion.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/HiveTypeCoercion.scala index 62c27ee0b9ee0..f2f2ba2f96552 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/HiveTypeCoercion.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/HiveTypeCoercion.scala @@ -605,7 +605,7 @@ object HiveTypeCoercion { def apply(plan: LogicalPlan): LogicalPlan = plan resolveExpressions { case c: CaseWhenLike if c.childrenResolved && !c.valueTypesEqual => logDebug(s"Input values for null casting ${c.valueTypes.mkString(",")}") - val maybeCommonType = findTightestCommonTypeAndPromoteToString(c.valueTypes) + val maybeCommonType = findWiderCommonType(c.valueTypes) maybeCommonType.map { commonType => val castedBranches = c.branches.grouped(2).map { case Seq(when, value) if value.dataType != commonType => @@ -622,7 +622,7 @@ object HiveTypeCoercion { case c: CaseKeyWhen if c.childrenResolved && !c.resolved => val maybeCommonType = - findTightestCommonTypeAndPromoteToString((c.key +: c.whenList).map(_.dataType)) + findWiderCommonType((c.key +: c.whenList).map(_.dataType)) maybeCommonType.map { commonType => val castedBranches = c.branches.grouped(2).map { case Seq(whenExpr, thenExpr) if whenExpr.dataType != commonType => diff --git a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/analysis/HiveTypeCoercionSuite.scala b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/analysis/HiveTypeCoercionSuite.scala index cbdf453f600ab..6f33ab733b615 100644 --- a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/analysis/HiveTypeCoercionSuite.scala +++ b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/analysis/HiveTypeCoercionSuite.scala @@ -285,6 +285,17 @@ class HiveTypeCoercionSuite extends PlanTest { CaseKeyWhen(Literal(true), Seq(Literal(1), Literal("a"))), CaseKeyWhen(Literal(true), Seq(Literal(1), Literal("a"))) ) + ruleTest(HiveTypeCoercion.CaseWhenCoercion, + CaseWhen(Seq(Literal(true), Literal(1.2), Literal.create(1, DecimalType(7, 2)))), + CaseWhen(Seq( + Literal(true), Literal(1.2), Cast(Literal.create(1, DecimalType(7, 2)), DoubleType))) + ) + ruleTest(HiveTypeCoercion.CaseWhenCoercion, + CaseWhen(Seq(Literal(true), Literal(100L), Literal.create(1, DecimalType(7, 2)))), + CaseWhen(Seq( + Literal(true), Cast(Literal(100L), DecimalType(22, 2)), + Cast(Literal.create(1, DecimalType(7, 2)), DecimalType(22, 2)))) + ) } test("type coercion simplification for equal to") { From e0dd1309ac248375f429639801923570f14de18d Mon Sep 17 00:00:00 2001 From: Marcelo Vanzin Date: Wed, 19 Aug 2015 14:33:32 -0700 Subject: [PATCH 009/802] [SPARK-10119] [CORE] Fix isDynamicAllocationEnabled when config is expliticly disabled. Author: Marcelo Vanzin Closes #8316 from vanzin/SPARK-10119. 
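The corrected predicate fits in a few lines; this sketch mirrors the `UtilsSuite` cases added below (the local helper simply restates the patched logic, the real method lives in `Utils`):

    import org.apache.spark.SparkConf

    // Dynamic allocation counts as enabled only when the flag is explicitly
    // true and no fixed executor count is requested; an explicit "false"
    // no longer slips through just because the key is present.
    def dynamicAllocationEnabled(conf: SparkConf): Boolean =
      conf.getBoolean("spark.dynamicAllocation.enabled", false) &&
        conf.getInt("spark.executor.instances", 0) == 0

    val conf = new SparkConf()
    assert(!dynamicAllocationEnabled(conf))
    assert(!dynamicAllocationEnabled(conf.set("spark.dynamicAllocation.enabled", "false")))
    assert(dynamicAllocationEnabled(conf.set("spark.dynamicAllocation.enabled", "true")))
    assert(!dynamicAllocationEnabled(conf.set("spark.executor.instances", "1")))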
--- .../main/scala/org/apache/spark/util/Utils.scala | 2 +- .../scala/org/apache/spark/util/UtilsSuite.scala | 14 ++++++++++++++ 2 files changed, 15 insertions(+), 1 deletion(-) diff --git a/core/src/main/scala/org/apache/spark/util/Utils.scala b/core/src/main/scala/org/apache/spark/util/Utils.scala index fddc24dbfc237..8313312226713 100644 --- a/core/src/main/scala/org/apache/spark/util/Utils.scala +++ b/core/src/main/scala/org/apache/spark/util/Utils.scala @@ -2141,7 +2141,7 @@ private[spark] object Utils extends Logging { * the latter should override the former (SPARK-9092). */ def isDynamicAllocationEnabled(conf: SparkConf): Boolean = { - conf.contains("spark.dynamicAllocation.enabled") && + conf.getBoolean("spark.dynamicAllocation.enabled", false) && conf.getInt("spark.executor.instances", 0) == 0 } diff --git a/core/src/test/scala/org/apache/spark/util/UtilsSuite.scala b/core/src/test/scala/org/apache/spark/util/UtilsSuite.scala index 8f7e402d5f2a6..1fb81ad565b41 100644 --- a/core/src/test/scala/org/apache/spark/util/UtilsSuite.scala +++ b/core/src/test/scala/org/apache/spark/util/UtilsSuite.scala @@ -720,4 +720,18 @@ class UtilsSuite extends SparkFunSuite with ResetSystemProperties with Logging { assert(Utils.nanSafeCompareFloats(Float.PositiveInfinity, Float.NaN) === -1) assert(Utils.nanSafeCompareFloats(Float.NegativeInfinity, Float.NaN) === -1) } + + test("isDynamicAllocationEnabled") { + val conf = new SparkConf() + assert(Utils.isDynamicAllocationEnabled(conf) === false) + assert(Utils.isDynamicAllocationEnabled( + conf.set("spark.dynamicAllocation.enabled", "false")) === false) + assert(Utils.isDynamicAllocationEnabled( + conf.set("spark.dynamicAllocation.enabled", "true")) === true) + assert(Utils.isDynamicAllocationEnabled( + conf.set("spark.executor.instances", "1")) === false) + assert(Utils.isDynamicAllocationEnabled( + conf.set("spark.executor.instances", "0")) === true) + } + } From b0dbaec4f942a47afde3490b9339ad3bd187024d Mon Sep 17 00:00:00 2001 From: Wenchen Fan Date: Wed, 19 Aug 2015 15:04:56 -0700 Subject: [PATCH 010/802] [SPARK-6489] [SQL] add column pruning for Generate This PR takes over https://github.com/apache/spark/pull/5358 Author: Wenchen Fan Closes #8268 from cloud-fan/6489. 
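As a rough illustration of what the new rule buys, assume a DataFrame `df` with scalar columns `a`, `b` and an array column `c` (the names are hypothetical); selecting only the exploded output should no longer force the unused columns through the Generate:

    import org.apache.spark.sql.functions.explode

    // Only `c` is referenced: after optimization a Project(c) is pushed beneath
    // the Generate and, since no child output is selected, Generate.join is
    // turned off, so `a` and `b` are pruned before the explode runs.
    val exploded = df.select(explode(df("c")).as("s"))
    exploded.explain(true)  // inspect the optimized plan for the pushed Project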
--- .../sql/catalyst/expressions/generators.scala | 2 - .../sql/catalyst/optimizer/Optimizer.scala | 16 ++++ .../optimizer/ColumnPruningSuite.scala | 84 +++++++++++++++++++ 3 files changed, 100 insertions(+), 2 deletions(-) create mode 100644 sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/optimizer/ColumnPruningSuite.scala diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/generators.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/generators.scala index d474853355e5b..c0845e1a0102f 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/generators.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/generators.scala @@ -17,8 +17,6 @@ package org.apache.spark.sql.catalyst.expressions -import scala.collection.Map - import org.apache.spark.sql.Row import org.apache.spark.sql.catalyst.{CatalystTypeConverters, InternalRow} import org.apache.spark.sql.catalyst.analysis.TypeCheckResult diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/optimizer/Optimizer.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/optimizer/Optimizer.scala index 47b06cae15436..42457d5318b48 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/optimizer/Optimizer.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/optimizer/Optimizer.scala @@ -165,6 +165,7 @@ object SetOperationPushDown extends Rule[LogicalPlan] { * * - Inserting Projections beneath the following operators: * - Aggregate + * - Generate * - Project <- Join * - LeftSemiJoin */ @@ -178,6 +179,21 @@ object ColumnPruning extends Rule[LogicalPlan] { case a @ Aggregate(_, _, child) if (child.outputSet -- a.references).nonEmpty => a.copy(child = Project(a.references.toSeq, child)) + // Eliminate attributes that are not needed to calculate the Generate. + case g: Generate if !g.join && (g.child.outputSet -- g.references).nonEmpty => + g.copy(child = Project(g.references.toSeq, g.child)) + + case p @ Project(_, g: Generate) if g.join && p.references.subsetOf(g.generatedSet) => + p.copy(child = g.copy(join = false)) + + case p @ Project(projectList, g: Generate) if g.join => + val neededChildOutput = p.references -- g.generatorOutput ++ g.references + if (neededChildOutput == g.child.outputSet) { + p + } else { + Project(projectList, g.copy(child = Project(neededChildOutput.toSeq, g.child))) + } + case p @ Project(projectList, a @ Aggregate(groupingExpressions, aggregateExpressions, child)) if (a.outputSet -- p.references).nonEmpty => Project( diff --git a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/optimizer/ColumnPruningSuite.scala b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/optimizer/ColumnPruningSuite.scala new file mode 100644 index 0000000000000..dbebcb86809de --- /dev/null +++ b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/optimizer/ColumnPruningSuite.scala @@ -0,0 +1,84 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.spark.sql.catalyst.optimizer + +import org.apache.spark.sql.catalyst.expressions.Explode +import org.apache.spark.sql.catalyst.plans.PlanTest +import org.apache.spark.sql.catalyst.plans.logical.{Project, LocalRelation, Generate, LogicalPlan} +import org.apache.spark.sql.catalyst.rules.RuleExecutor +import org.apache.spark.sql.catalyst.dsl.expressions._ +import org.apache.spark.sql.catalyst.dsl.plans._ +import org.apache.spark.sql.types.StringType + +class ColumnPruningSuite extends PlanTest { + + object Optimize extends RuleExecutor[LogicalPlan] { + val batches = Batch("Column pruning", FixedPoint(100), + ColumnPruning) :: Nil + } + + test("Column pruning for Generate when Generate.join = false") { + val input = LocalRelation('a.int, 'b.array(StringType)) + + val query = Generate(Explode('b), false, false, None, 's.string :: Nil, input).analyze + val optimized = Optimize.execute(query) + + val correctAnswer = + Generate(Explode('b), false, false, None, 's.string :: Nil, + Project('b.attr :: Nil, input)).analyze + + comparePlans(optimized, correctAnswer) + } + + test("Column pruning for Generate when Generate.join = true") { + val input = LocalRelation('a.int, 'b.int, 'c.array(StringType)) + + val query = + Project(Seq('a, 's), + Generate(Explode('c), true, false, None, 's.string :: Nil, + input)).analyze + val optimized = Optimize.execute(query) + + val correctAnswer = + Project(Seq('a, 's), + Generate(Explode('c), true, false, None, 's.string :: Nil, + Project(Seq('a, 'c), + input))).analyze + + comparePlans(optimized, correctAnswer) + } + + test("Turn Generate.join to false if possible") { + val input = LocalRelation('b.array(StringType)) + + val query = + Project(('s + 1).as("s+1") :: Nil, + Generate(Explode('b), true, false, None, 's.string :: Nil, + input)).analyze + val optimized = Optimize.execute(query) + + val correctAnswer = + Project(('s + 1).as("s+1") :: Nil, + Generate(Explode('b), false, false, None, 's.string :: Nil, + input)).analyze + + comparePlans(optimized, correctAnswer) + } + + // todo: add more tests for column pruning +} From 8e0a072f78b4902d5f7ccc6b15232ed202a117f9 Mon Sep 17 00:00:00 2001 From: Eric Liang Date: Wed, 19 Aug 2015 15:43:08 -0700 Subject: [PATCH 011/802] [SPARK-9895] User Guide for RFormula Feature Transformer mengxr Author: Eric Liang Closes #8293 from ericl/docs-2. --- docs/ml-features.md | 108 ++++++++++++++++++ .../apache/spark/ml/feature/RFormula.scala | 4 +- 2 files changed, 110 insertions(+), 2 deletions(-) diff --git a/docs/ml-features.md b/docs/ml-features.md index d0e8eeb7a757e..6309db97be4d0 100644 --- a/docs/ml-features.md +++ b/docs/ml-features.md @@ -1477,3 +1477,111 @@ print(output.select("features", "clicked").first()) +## RFormula + +`RFormula` selects columns specified by an [R model formula](https://stat.ethz.ch/R-manual/R-devel/library/stats/html/formula.html). It produces a vector column of features and a double column of labels. Like when formulas are used in R for linear regression, string input columns will be one-hot encoded, and numeric columns will be cast to doubles. 
If not already present in the DataFrame, the output label column will be created from the specified response variable in the formula. + +**Examples** + +Assume that we have a DataFrame with the columns `id`, `country`, `hour`, and `clicked`: + +~~~ +id | country | hour | clicked +---|---------|------|--------- + 7 | "US" | 18 | 1.0 + 8 | "CA" | 12 | 0.0 + 9 | "NZ" | 15 | 0.0 +~~~ + +If we use `RFormula` with a formula string of `clicked ~ country + hour`, which indicates that we want to +predict `clicked` based on `country` and `hour`, after transformation we should get the following DataFrame: + +~~~ +id | country | hour | clicked | features | label +---|---------|------|---------|------------------|------- + 7 | "US" | 18 | 1.0 | [0.0, 0.0, 18.0] | 1.0 + 8 | "CA" | 12 | 0.0 | [0.0, 1.0, 12.0] | 0.0 + 9 | "NZ" | 15 | 0.0 | [1.0, 0.0, 15.0] | 0.0 +~~~ + +
+<div data-lang="scala" markdown="1">
+ +[`RFormula`](api/scala/index.html#org.apache.spark.ml.feature.RFormula) takes an R formula string, and optional parameters for the names of its output columns. + +{% highlight scala %} +import org.apache.spark.ml.feature.RFormula + +val dataset = sqlContext.createDataFrame(Seq( + (7, "US", 18, 1.0), + (8, "CA", 12, 0.0), + (9, "NZ", 15, 0.0) +)).toDF("id", "country", "hour", "clicked") +val formula = new RFormula() + .setFormula("clicked ~ country + hour") + .setFeaturesCol("features") + .setLabelCol("label") +val output = formula.fit(dataset).transform(dataset) +output.select("features", "label").show() +{% endhighlight %} +
+ +<div data-lang="java" markdown="1">
+ +[`RFormula`](api/java/org/apache/spark/ml/feature/RFormula.html) takes an R formula string, and optional parameters for the names of its output columns. + +{% highlight java %} +import java.util.Arrays; + +import org.apache.spark.api.java.JavaRDD; +import org.apache.spark.ml.feature.RFormula; +import org.apache.spark.sql.DataFrame; +import org.apache.spark.sql.Row; +import org.apache.spark.sql.RowFactory; +import org.apache.spark.sql.types.*; +import static org.apache.spark.sql.types.DataTypes.*; + +StructType schema = createStructType(new StructField[] { + createStructField("id", IntegerType, false), + createStructField("country", StringType, false), + createStructField("hour", IntegerType, false), + createStructField("clicked", DoubleType, false) +}); +JavaRDD rdd = jsc.parallelize(Arrays.asList( + RowFactory.create(7, "US", 18, 1.0), + RowFactory.create(8, "CA", 12, 0.0), + RowFactory.create(9, "NZ", 15, 0.0) +)); +DataFrame dataset = sqlContext.createDataFrame(rdd, schema); + +RFormula formula = new RFormula() + .setFormula("clicked ~ country + hour") + .setFeaturesCol("features") + .setLabelCol("label"); + +DataFrame output = formula.fit(dataset).transform(dataset); +output.select("features", "label").show(); +{% endhighlight %} +
+ +<div data-lang="python" markdown="1">
+ +[`RFormula`](api/python/pyspark.ml.html#pyspark.ml.feature.RFormula) takes an R formula string, and optional parameters for the names of its output columns. + +{% highlight python %} +from pyspark.ml.feature import RFormula + +dataset = sqlContext.createDataFrame( + [(7, "US", 18, 1.0), + (8, "CA", 12, 0.0), + (9, "NZ", 15, 0.0)], + ["id", "country", "hour", "clicked"]) +formula = RFormula( + formula="clicked ~ country + hour", + featuresCol="features", + labelCol="label") +output = formula.fit(dataset).transform(dataset) +output.select("features", "label").show() +{% endhighlight %} +
+</div>
diff --git a/mllib/src/main/scala/org/apache/spark/ml/feature/RFormula.scala b/mllib/src/main/scala/org/apache/spark/ml/feature/RFormula.scala index a752dacd72d95..a7fa50444209b 100644 --- a/mllib/src/main/scala/org/apache/spark/ml/feature/RFormula.scala +++ b/mllib/src/main/scala/org/apache/spark/ml/feature/RFormula.scala @@ -42,8 +42,8 @@ private[feature] trait RFormulaBase extends HasFeaturesCol with HasLabelCol { /** * :: Experimental :: * Implements the transforms required for fitting a dataset against an R model formula. Currently - * we support a limited subset of the R operators, including '~' and '+'. Also see the R formula - * docs here: http://stat.ethz.ch/R-manual/R-patched/library/stats/html/formula.html + * we support a limited subset of the R operators, including '.', '~', '+', and '-'. Also see the + * R formula docs here: http://stat.ethz.ch/R-manual/R-patched/library/stats/html/formula.html */ @Experimental class RFormula(override val uid: String) extends Estimator[RFormulaModel] with RFormulaBase { From ba5f7e1842f2c5852b5309910c0d39926643da69 Mon Sep 17 00:00:00 2001 From: hyukjinkwon Date: Thu, 20 Aug 2015 08:13:25 +0800 Subject: [PATCH 012/802] [SPARK-10035] [SQL] Parquet filters does not process EqualNullSafe filter. MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit As I talked with Lian, 1. I added EquelNullSafe to ParquetFilters - It uses the same equality comparison filter with EqualTo since the Parquet filter performs actually null-safe equality comparison. 2. Updated the test code (ParquetFilterSuite) - Convert catalyst.Expression to sources.Filter - Removed Cast since only Literal is picked up as a proper Filter in DataSourceStrategy - Added EquelNullSafe comparison 3. Removed deprecated createFilter for catalyst.Expression Author: hyukjinkwon Author: 권혁진 Closes #8275 from HyukjinKwon/master. 
--- .../datasources/parquet/ParquetFilters.scala | 113 +++--------------- .../parquet/ParquetFilterSuite.scala | 63 ++++------ 2 files changed, 37 insertions(+), 139 deletions(-) diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/parquet/ParquetFilters.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/parquet/ParquetFilters.scala index 63915e0a28655..c74c8388632f5 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/parquet/ParquetFilters.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/parquet/ParquetFilters.scala @@ -22,8 +22,6 @@ import java.nio.ByteBuffer import com.google.common.io.BaseEncoding import org.apache.hadoop.conf.Configuration -import org.apache.parquet.filter2.compat.FilterCompat -import org.apache.parquet.filter2.compat.FilterCompat._ import org.apache.parquet.filter2.predicate.FilterApi._ import org.apache.parquet.filter2.predicate._ import org.apache.parquet.io.api.Binary @@ -39,12 +37,6 @@ import org.apache.spark.unsafe.types.UTF8String private[sql] object ParquetFilters { val PARQUET_FILTER_DATA = "org.apache.spark.sql.parquet.row.filter" - def createRecordFilter(filterExpressions: Seq[Expression]): Option[Filter] = { - filterExpressions.flatMap { filter => - createFilter(filter) - }.reduceOption(FilterApi.and).map(FilterCompat.get) - } - case class SetInFilter[T <: Comparable[T]]( valueSet: Set[T]) extends UserDefinedPredicate[T] with Serializable { @@ -205,6 +197,16 @@ private[sql] object ParquetFilters { // For any comparison operator `cmp`, both `a cmp NULL` and `NULL cmp a` evaluate to `NULL`, // which can be casted to `false` implicitly. Please refer to the `eval` method of these // operators and the `SimplifyFilters` rule for details. + + // Hyukjin: + // I added [[EqualNullSafe]] with [[org.apache.parquet.filter2.predicate.Operators.Eq]]. + // So, it performs equality comparison identically when given [[sources.Filter]] is [[EqualTo]]. + // The reason why I did this is, that the actual Parquet filter checks null-safe equality + // comparison. + // So I added this and maybe [[EqualTo]] should be changed. It still seems fine though, because + // physical planning does not set `NULL` to [[EqualTo]] but changes it to [[IsNull]] and etc. + // Probably I missed something and obviously this should be changed. + predicate match { case sources.IsNull(name) => makeEq.lift(dataTypeOf(name)).map(_(name, null)) @@ -216,6 +218,11 @@ private[sql] object ParquetFilters { case sources.Not(sources.EqualTo(name, value)) => makeNotEq.lift(dataTypeOf(name)).map(_(name, value)) + case sources.EqualNullSafe(name, value) => + makeEq.lift(dataTypeOf(name)).map(_(name, value)) + case sources.Not(sources.EqualNullSafe(name, value)) => + makeNotEq.lift(dataTypeOf(name)).map(_(name, value)) + case sources.LessThan(name, value) => makeLt.lift(dataTypeOf(name)).map(_(name, value)) case sources.LessThanOrEqual(name, value) => @@ -273,96 +280,6 @@ private[sql] object ParquetFilters { addMethod.invoke(null, classOf[Binary], enumTypeDescriptor) } - /** - * Converts Catalyst predicate expressions to Parquet filter predicates. - * - * @todo This can be removed once we get rid of the old Parquet support. - */ - def createFilter(predicate: Expression): Option[FilterPredicate] = { - // NOTE: - // - // For any comparison operator `cmp`, both `a cmp NULL` and `NULL cmp a` evaluate to `NULL`, - // which can be casted to `false` implicitly. 
Please refer to the `eval` method of these - // operators and the `SimplifyFilters` rule for details. - predicate match { - case IsNull(NamedExpression(name, dataType)) => - makeEq.lift(dataType).map(_(name, null)) - case IsNotNull(NamedExpression(name, dataType)) => - makeNotEq.lift(dataType).map(_(name, null)) - - case EqualTo(NamedExpression(name, _), NonNullLiteral(value, dataType)) => - makeEq.lift(dataType).map(_(name, value)) - case EqualTo(Cast(NamedExpression(name, _), dataType), NonNullLiteral(value, _)) => - makeEq.lift(dataType).map(_(name, value)) - case EqualTo(NonNullLiteral(value, dataType), NamedExpression(name, _)) => - makeEq.lift(dataType).map(_(name, value)) - case EqualTo(NonNullLiteral(value, _), Cast(NamedExpression(name, _), dataType)) => - makeEq.lift(dataType).map(_(name, value)) - - case Not(EqualTo(NamedExpression(name, _), NonNullLiteral(value, dataType))) => - makeNotEq.lift(dataType).map(_(name, value)) - case Not(EqualTo(Cast(NamedExpression(name, _), dataType), NonNullLiteral(value, _))) => - makeNotEq.lift(dataType).map(_(name, value)) - case Not(EqualTo(NonNullLiteral(value, dataType), NamedExpression(name, _))) => - makeNotEq.lift(dataType).map(_(name, value)) - case Not(EqualTo(NonNullLiteral(value, _), Cast(NamedExpression(name, _), dataType))) => - makeNotEq.lift(dataType).map(_(name, value)) - - case LessThan(NamedExpression(name, _), NonNullLiteral(value, dataType)) => - makeLt.lift(dataType).map(_(name, value)) - case LessThan(Cast(NamedExpression(name, _), dataType), NonNullLiteral(value, _)) => - makeLt.lift(dataType).map(_(name, value)) - case LessThan(NonNullLiteral(value, dataType), NamedExpression(name, _)) => - makeGt.lift(dataType).map(_(name, value)) - case LessThan(NonNullLiteral(value, _), Cast(NamedExpression(name, _), dataType)) => - makeGt.lift(dataType).map(_(name, value)) - - case LessThanOrEqual(NamedExpression(name, _), NonNullLiteral(value, dataType)) => - makeLtEq.lift(dataType).map(_(name, value)) - case LessThanOrEqual(Cast(NamedExpression(name, _), dataType), NonNullLiteral(value, _)) => - makeLtEq.lift(dataType).map(_(name, value)) - case LessThanOrEqual(NonNullLiteral(value, dataType), NamedExpression(name, _)) => - makeGtEq.lift(dataType).map(_(name, value)) - case LessThanOrEqual(NonNullLiteral(value, _), Cast(NamedExpression(name, _), dataType)) => - makeGtEq.lift(dataType).map(_(name, value)) - - case GreaterThan(NamedExpression(name, _), NonNullLiteral(value, dataType)) => - makeGt.lift(dataType).map(_(name, value)) - case GreaterThan(Cast(NamedExpression(name, _), dataType), NonNullLiteral(value, _)) => - makeGt.lift(dataType).map(_(name, value)) - case GreaterThan(NonNullLiteral(value, dataType), NamedExpression(name, _)) => - makeLt.lift(dataType).map(_(name, value)) - case GreaterThan(NonNullLiteral(value, _), Cast(NamedExpression(name, _), dataType)) => - makeLt.lift(dataType).map(_(name, value)) - - case GreaterThanOrEqual(NamedExpression(name, _), NonNullLiteral(value, dataType)) => - makeGtEq.lift(dataType).map(_(name, value)) - case GreaterThanOrEqual(Cast(NamedExpression(name, _), dataType), NonNullLiteral(value, _)) => - makeGtEq.lift(dataType).map(_(name, value)) - case GreaterThanOrEqual(NonNullLiteral(value, dataType), NamedExpression(name, _)) => - makeLtEq.lift(dataType).map(_(name, value)) - case GreaterThanOrEqual(NonNullLiteral(value, _), Cast(NamedExpression(name, _), dataType)) => - makeLtEq.lift(dataType).map(_(name, value)) - - case And(lhs, rhs) => - (createFilter(lhs) ++ 
createFilter(rhs)).reduceOption(FilterApi.and) - - case Or(lhs, rhs) => - for { - lhsFilter <- createFilter(lhs) - rhsFilter <- createFilter(rhs) - } yield FilterApi.or(lhsFilter, rhsFilter) - - case Not(pred) => - createFilter(pred).map(FilterApi.not) - - case InSet(NamedExpression(name, dataType), valueSet) => - makeInSet.lift(dataType).map(_(name, valueSet)) - - case _ => None - } - } - /** * Note: Inside the Hadoop API we only have access to `Configuration`, not to * [[org.apache.spark.SparkContext]], so we cannot use broadcasts to convey diff --git a/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/parquet/ParquetFilterSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/parquet/ParquetFilterSuite.scala index 5b4e568bb9838..f067112cfca95 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/parquet/ParquetFilterSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/parquet/ParquetFilterSuite.scala @@ -24,9 +24,8 @@ import org.apache.spark.sql.{Column, DataFrame, QueryTest, Row, SQLConf} import org.apache.spark.sql.catalyst.dsl.expressions._ import org.apache.spark.sql.catalyst.expressions._ import org.apache.spark.sql.catalyst.planning.PhysicalOperation -import org.apache.spark.sql.execution.datasources.LogicalRelation +import org.apache.spark.sql.execution.datasources.{DataSourceStrategy, LogicalRelation} import org.apache.spark.sql.test.SharedSQLContext -import org.apache.spark.sql.types._ /** * A test suite that tests Parquet filter2 API based filter pushdown optimization. @@ -55,20 +54,22 @@ class ParquetFilterSuite extends QueryTest with ParquetTest with SharedSQLContex .select(output.map(e => Column(e)): _*) .where(Column(predicate)) - val maybeAnalyzedPredicate = query.queryExecution.optimizedPlan.collect { + val analyzedPredicate = query.queryExecution.optimizedPlan.collect { case PhysicalOperation(_, filters, LogicalRelation(_: ParquetRelation)) => filters - }.flatten.reduceOption(_ && _) + }.flatten + assert(analyzedPredicate.nonEmpty) - assert(maybeAnalyzedPredicate.isDefined) - maybeAnalyzedPredicate.foreach { pred => - val maybeFilter = ParquetFilters.createFilter(pred) + val selectedFilters = DataSourceStrategy.selectFilters(analyzedPredicate) + assert(selectedFilters.nonEmpty) + + selectedFilters.foreach { pred => + val maybeFilter = ParquetFilters.createFilter(df.schema, pred) assert(maybeFilter.isDefined, s"Couldn't generate filter predicate for $pred") maybeFilter.foreach { f => // Doesn't bother checking type parameters here (e.g. 
`Eq[Integer]`) assert(f.getClass === filterClass) } } - checker(query, expected) } } @@ -109,43 +110,18 @@ class ParquetFilterSuite extends QueryTest with ParquetTest with SharedSQLContex checkFilterPredicate('_1.isNotNull, classOf[NotEq[_]], Seq(Row(true), Row(false))) checkFilterPredicate('_1 === true, classOf[Eq[_]], true) + checkFilterPredicate('_1 <=> true, classOf[Eq[_]], true) checkFilterPredicate('_1 !== true, classOf[NotEq[_]], false) } } - test("filter pushdown - short") { - withParquetDataFrame((1 to 4).map(i => Tuple1(Option(i.toShort)))) { implicit df => - checkFilterPredicate(Cast('_1, IntegerType) === 1, classOf[Eq[_]], 1) - checkFilterPredicate( - Cast('_1, IntegerType) !== 1, classOf[NotEq[_]], (2 to 4).map(Row.apply(_))) - - checkFilterPredicate(Cast('_1, IntegerType) < 2, classOf[Lt[_]], 1) - checkFilterPredicate(Cast('_1, IntegerType) > 3, classOf[Gt[_]], 4) - checkFilterPredicate(Cast('_1, IntegerType) <= 1, classOf[LtEq[_]], 1) - checkFilterPredicate(Cast('_1, IntegerType) >= 4, classOf[GtEq[_]], 4) - - checkFilterPredicate(Literal(1) === Cast('_1, IntegerType), classOf[Eq[_]], 1) - checkFilterPredicate(Literal(2) > Cast('_1, IntegerType), classOf[Lt[_]], 1) - checkFilterPredicate(Literal(3) < Cast('_1, IntegerType), classOf[Gt[_]], 4) - checkFilterPredicate(Literal(1) >= Cast('_1, IntegerType), classOf[LtEq[_]], 1) - checkFilterPredicate(Literal(4) <= Cast('_1, IntegerType), classOf[GtEq[_]], 4) - - checkFilterPredicate(!(Cast('_1, IntegerType) < 4), classOf[GtEq[_]], 4) - checkFilterPredicate( - Cast('_1, IntegerType) > 2 && Cast('_1, IntegerType) < 4, classOf[Operators.And], 3) - checkFilterPredicate( - Cast('_1, IntegerType) < 2 || Cast('_1, IntegerType) > 3, - classOf[Operators.Or], - Seq(Row(1), Row(4))) - } - } - test("filter pushdown - integer") { withParquetDataFrame((1 to 4).map(i => Tuple1(Option(i)))) { implicit df => checkFilterPredicate('_1.isNull, classOf[Eq[_]], Seq.empty[Row]) checkFilterPredicate('_1.isNotNull, classOf[NotEq[_]], (1 to 4).map(Row.apply(_))) checkFilterPredicate('_1 === 1, classOf[Eq[_]], 1) + checkFilterPredicate('_1 <=> 1, classOf[Eq[_]], 1) checkFilterPredicate('_1 !== 1, classOf[NotEq[_]], (2 to 4).map(Row.apply(_))) checkFilterPredicate('_1 < 2, classOf[Lt[_]], 1) @@ -154,13 +130,13 @@ class ParquetFilterSuite extends QueryTest with ParquetTest with SharedSQLContex checkFilterPredicate('_1 >= 4, classOf[GtEq[_]], 4) checkFilterPredicate(Literal(1) === '_1, classOf[Eq[_]], 1) + checkFilterPredicate(Literal(1) <=> '_1, classOf[Eq[_]], 1) checkFilterPredicate(Literal(2) > '_1, classOf[Lt[_]], 1) checkFilterPredicate(Literal(3) < '_1, classOf[Gt[_]], 4) checkFilterPredicate(Literal(1) >= '_1, classOf[LtEq[_]], 1) checkFilterPredicate(Literal(4) <= '_1, classOf[GtEq[_]], 4) checkFilterPredicate(!('_1 < 4), classOf[GtEq[_]], 4) - checkFilterPredicate('_1 > 2 && '_1 < 4, classOf[Operators.And], 3) checkFilterPredicate('_1 < 2 || '_1 > 3, classOf[Operators.Or], Seq(Row(1), Row(4))) } } @@ -171,6 +147,7 @@ class ParquetFilterSuite extends QueryTest with ParquetTest with SharedSQLContex checkFilterPredicate('_1.isNotNull, classOf[NotEq[_]], (1 to 4).map(Row.apply(_))) checkFilterPredicate('_1 === 1, classOf[Eq[_]], 1) + checkFilterPredicate('_1 <=> 1, classOf[Eq[_]], 1) checkFilterPredicate('_1 !== 1, classOf[NotEq[_]], (2 to 4).map(Row.apply(_))) checkFilterPredicate('_1 < 2, classOf[Lt[_]], 1) @@ -179,13 +156,13 @@ class ParquetFilterSuite extends QueryTest with ParquetTest with SharedSQLContex checkFilterPredicate('_1 >= 4, 
classOf[GtEq[_]], 4) checkFilterPredicate(Literal(1) === '_1, classOf[Eq[_]], 1) + checkFilterPredicate(Literal(1) <=> '_1, classOf[Eq[_]], 1) checkFilterPredicate(Literal(2) > '_1, classOf[Lt[_]], 1) checkFilterPredicate(Literal(3) < '_1, classOf[Gt[_]], 4) checkFilterPredicate(Literal(1) >= '_1, classOf[LtEq[_]], 1) checkFilterPredicate(Literal(4) <= '_1, classOf[GtEq[_]], 4) checkFilterPredicate(!('_1 < 4), classOf[GtEq[_]], 4) - checkFilterPredicate('_1 > 2 && '_1 < 4, classOf[Operators.And], 3) checkFilterPredicate('_1 < 2 || '_1 > 3, classOf[Operators.Or], Seq(Row(1), Row(4))) } } @@ -196,6 +173,7 @@ class ParquetFilterSuite extends QueryTest with ParquetTest with SharedSQLContex checkFilterPredicate('_1.isNotNull, classOf[NotEq[_]], (1 to 4).map(Row.apply(_))) checkFilterPredicate('_1 === 1, classOf[Eq[_]], 1) + checkFilterPredicate('_1 <=> 1, classOf[Eq[_]], 1) checkFilterPredicate('_1 !== 1, classOf[NotEq[_]], (2 to 4).map(Row.apply(_))) checkFilterPredicate('_1 < 2, classOf[Lt[_]], 1) @@ -204,13 +182,13 @@ class ParquetFilterSuite extends QueryTest with ParquetTest with SharedSQLContex checkFilterPredicate('_1 >= 4, classOf[GtEq[_]], 4) checkFilterPredicate(Literal(1) === '_1, classOf[Eq[_]], 1) + checkFilterPredicate(Literal(1) <=> '_1, classOf[Eq[_]], 1) checkFilterPredicate(Literal(2) > '_1, classOf[Lt[_]], 1) checkFilterPredicate(Literal(3) < '_1, classOf[Gt[_]], 4) checkFilterPredicate(Literal(1) >= '_1, classOf[LtEq[_]], 1) checkFilterPredicate(Literal(4) <= '_1, classOf[GtEq[_]], 4) checkFilterPredicate(!('_1 < 4), classOf[GtEq[_]], 4) - checkFilterPredicate('_1 > 2 && '_1 < 4, classOf[Operators.And], 3) checkFilterPredicate('_1 < 2 || '_1 > 3, classOf[Operators.Or], Seq(Row(1), Row(4))) } } @@ -221,6 +199,7 @@ class ParquetFilterSuite extends QueryTest with ParquetTest with SharedSQLContex checkFilterPredicate('_1.isNotNull, classOf[NotEq[_]], (1 to 4).map(Row.apply(_))) checkFilterPredicate('_1 === 1, classOf[Eq[_]], 1) + checkFilterPredicate('_1 <=> 1, classOf[Eq[_]], 1) checkFilterPredicate('_1 !== 1, classOf[NotEq[_]], (2 to 4).map(Row.apply(_))) checkFilterPredicate('_1 < 2, classOf[Lt[_]], 1) @@ -229,13 +208,13 @@ class ParquetFilterSuite extends QueryTest with ParquetTest with SharedSQLContex checkFilterPredicate('_1 >= 4, classOf[GtEq[_]], 4) checkFilterPredicate(Literal(1) === '_1, classOf[Eq[_]], 1) + checkFilterPredicate(Literal(1) <=> '_1, classOf[Eq[_]], 1) checkFilterPredicate(Literal(2) > '_1, classOf[Lt[_]], 1) checkFilterPredicate(Literal(3) < '_1, classOf[Gt[_]], 4) checkFilterPredicate(Literal(1) >= '_1, classOf[LtEq[_]], 1) checkFilterPredicate(Literal(4) <= '_1, classOf[GtEq[_]], 4) checkFilterPredicate(!('_1 < 4), classOf[GtEq[_]], 4) - checkFilterPredicate('_1 > 2 && '_1 < 4, classOf[Operators.And], 3) checkFilterPredicate('_1 < 2 || '_1 > 3, classOf[Operators.Or], Seq(Row(1), Row(4))) } } @@ -247,6 +226,7 @@ class ParquetFilterSuite extends QueryTest with ParquetTest with SharedSQLContex '_1.isNotNull, classOf[NotEq[_]], (1 to 4).map(i => Row.apply(i.toString))) checkFilterPredicate('_1 === "1", classOf[Eq[_]], "1") + checkFilterPredicate('_1 <=> "1", classOf[Eq[_]], "1") checkFilterPredicate( '_1 !== "1", classOf[NotEq[_]], (2 to 4).map(i => Row.apply(i.toString))) @@ -256,13 +236,13 @@ class ParquetFilterSuite extends QueryTest with ParquetTest with SharedSQLContex checkFilterPredicate('_1 >= "4", classOf[GtEq[_]], "4") checkFilterPredicate(Literal("1") === '_1, classOf[Eq[_]], "1") + checkFilterPredicate(Literal("1") <=> '_1, classOf[Eq[_]], 
"1") checkFilterPredicate(Literal("2") > '_1, classOf[Lt[_]], "1") checkFilterPredicate(Literal("3") < '_1, classOf[Gt[_]], "4") checkFilterPredicate(Literal("1") >= '_1, classOf[LtEq[_]], "1") checkFilterPredicate(Literal("4") <= '_1, classOf[GtEq[_]], "4") checkFilterPredicate(!('_1 < "4"), classOf[GtEq[_]], "4") - checkFilterPredicate('_1 > "2" && '_1 < "4", classOf[Operators.And], "3") checkFilterPredicate('_1 < "2" || '_1 > "3", classOf[Operators.Or], Seq(Row("1"), Row("4"))) } } @@ -274,6 +254,7 @@ class ParquetFilterSuite extends QueryTest with ParquetTest with SharedSQLContex withParquetDataFrame((1 to 4).map(i => Tuple1(i.b))) { implicit df => checkBinaryFilterPredicate('_1 === 1.b, classOf[Eq[_]], 1.b) + checkBinaryFilterPredicate('_1 <=> 1.b, classOf[Eq[_]], 1.b) checkBinaryFilterPredicate('_1.isNull, classOf[Eq[_]], Seq.empty[Row]) checkBinaryFilterPredicate( @@ -288,13 +269,13 @@ class ParquetFilterSuite extends QueryTest with ParquetTest with SharedSQLContex checkBinaryFilterPredicate('_1 >= 4.b, classOf[GtEq[_]], 4.b) checkBinaryFilterPredicate(Literal(1.b) === '_1, classOf[Eq[_]], 1.b) + checkBinaryFilterPredicate(Literal(1.b) <=> '_1, classOf[Eq[_]], 1.b) checkBinaryFilterPredicate(Literal(2.b) > '_1, classOf[Lt[_]], 1.b) checkBinaryFilterPredicate(Literal(3.b) < '_1, classOf[Gt[_]], 4.b) checkBinaryFilterPredicate(Literal(1.b) >= '_1, classOf[LtEq[_]], 1.b) checkBinaryFilterPredicate(Literal(4.b) <= '_1, classOf[GtEq[_]], 4.b) checkBinaryFilterPredicate(!('_1 < 4.b), classOf[GtEq[_]], 4.b) - checkBinaryFilterPredicate('_1 > 2.b && '_1 < 4.b, classOf[Operators.And], 3.b) checkBinaryFilterPredicate( '_1 < 2.b || '_1 > 3.b, classOf[Operators.Or], Seq(Row(1.b), Row(4.b))) } From 2f2686a73f5a2a53ca5b1023e0d7e0e6c9be5896 Mon Sep 17 00:00:00 2001 From: Reynold Xin Date: Wed, 19 Aug 2015 17:35:41 -0700 Subject: [PATCH 013/802] [SPARK-9242] [SQL] Audit UDAF interface. A few minor changes: 1. Improved documentation 2. Rename apply(distinct....) to distinct. 3. Changed MutableAggregationBuffer from a trait to an abstract class. 4. Renamed returnDataType to dataType to be more consistent with other expressions. And unrelated to UDAFs: 1. Renamed file names in expressions to use suffix "Expressions" to be more consistent. 2. Moved regexp related expressions out to its own file. 3. Renamed StringComparison => StringPredicate. Author: Reynold Xin Closes #8321 from rxin/SPARK-9242. 
--- ...bitwise.scala => bitwiseExpressions.scala} | 0 ...als.scala => conditionalExpressions.scala} | 0 ...ctions.scala => datetimeExpressions.scala} | 0 ...nctions.scala => decimalExpressions.scala} | 0 ...nFunctions.scala => jsonExpressions.scala} | 0 .../{math.scala => mathExpressions.scala} | 0 ...lFunctions.scala => nullExpressions.scala} | 0 .../{random.scala => randomExpressions.scala} | 0 .../expressions/regexpExpressions.scala | 346 ++++++++++++++++++ ...erations.scala => stringExpressions.scala} | 332 +---------------- .../sql/catalyst/optimizer/Optimizer.scala | 2 +- .../expressions/StringExpressionsSuite.scala | 2 +- .../apache/spark/sql/UDFRegistration.scala | 1 + .../spark/sql/execution/aggregate/udaf.scala | 2 +- .../apache/spark/sql/expressions/udaf.scala | 44 ++- .../spark/sql/hive/JavaDataFrameSuite.java | 2 +- .../spark/sql/hive/aggregate/MyDoubleAvg.java | 2 +- .../spark/sql/hive/aggregate/MyDoubleSum.java | 2 +- 18 files changed, 386 insertions(+), 349 deletions(-) rename sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/{bitwise.scala => bitwiseExpressions.scala} (100%) rename sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/{conditionals.scala => conditionalExpressions.scala} (100%) rename sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/{datetimeFunctions.scala => datetimeExpressions.scala} (100%) rename sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/{decimalFunctions.scala => decimalExpressions.scala} (100%) rename sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/{jsonFunctions.scala => jsonExpressions.scala} (100%) rename sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/{math.scala => mathExpressions.scala} (100%) rename sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/{nullFunctions.scala => nullExpressions.scala} (100%) rename sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/{random.scala => randomExpressions.scala} (100%) create mode 100644 sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/regexpExpressions.scala rename sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/{stringOperations.scala => stringExpressions.scala} (74%) diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/bitwise.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/bitwiseExpressions.scala similarity index 100% rename from sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/bitwise.scala rename to sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/bitwiseExpressions.scala diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/conditionals.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/conditionalExpressions.scala similarity index 100% rename from sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/conditionals.scala rename to sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/conditionalExpressions.scala diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/datetimeFunctions.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/datetimeExpressions.scala similarity index 100% rename from sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/datetimeFunctions.scala rename to 
sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/datetimeExpressions.scala diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/decimalFunctions.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/decimalExpressions.scala similarity index 100% rename from sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/decimalFunctions.scala rename to sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/decimalExpressions.scala diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/jsonFunctions.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/jsonExpressions.scala similarity index 100% rename from sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/jsonFunctions.scala rename to sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/jsonExpressions.scala diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/math.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/mathExpressions.scala similarity index 100% rename from sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/math.scala rename to sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/mathExpressions.scala diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/nullFunctions.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/nullExpressions.scala similarity index 100% rename from sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/nullFunctions.scala rename to sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/nullExpressions.scala diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/random.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/randomExpressions.scala similarity index 100% rename from sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/random.scala rename to sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/randomExpressions.scala diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/regexpExpressions.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/regexpExpressions.scala new file mode 100644 index 0000000000000..6dff28a7cde46 --- /dev/null +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/regexpExpressions.scala @@ -0,0 +1,346 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.spark.sql.catalyst.expressions + +import java.util.regex.{MatchResult, Pattern} + +import org.apache.commons.lang3.StringEscapeUtils + +import org.apache.spark.sql.catalyst.expressions.codegen._ +import org.apache.spark.sql.catalyst.util.StringUtils +import org.apache.spark.sql.types._ +import org.apache.spark.unsafe.types.UTF8String + + +trait StringRegexExpression extends ImplicitCastInputTypes { + self: BinaryExpression => + + def escape(v: String): String + def matches(regex: Pattern, str: String): Boolean + + override def dataType: DataType = BooleanType + override def inputTypes: Seq[DataType] = Seq(StringType, StringType) + + // try cache the pattern for Literal + private lazy val cache: Pattern = right match { + case x @ Literal(value: String, StringType) => compile(value) + case _ => null + } + + protected def compile(str: String): Pattern = if (str == null) { + null + } else { + // Let it raise exception if couldn't compile the regex string + Pattern.compile(escape(str)) + } + + protected def pattern(str: String) = if (cache == null) compile(str) else cache + + protected override def nullSafeEval(input1: Any, input2: Any): Any = { + val regex = pattern(input2.asInstanceOf[UTF8String].toString) + if(regex == null) { + null + } else { + matches(regex, input1.asInstanceOf[UTF8String].toString) + } + } +} + + +/** + * Simple RegEx pattern matching function + */ +case class Like(left: Expression, right: Expression) + extends BinaryExpression with StringRegexExpression with CodegenFallback { + + override def escape(v: String): String = StringUtils.escapeLikeRegex(v) + + override def matches(regex: Pattern, str: String): Boolean = regex.matcher(str).matches() + + override def toString: String = s"$left LIKE $right" + + override protected def genCode(ctx: CodeGenContext, ev: GeneratedExpressionCode): String = { + val patternClass = classOf[Pattern].getName + val escapeFunc = StringUtils.getClass.getName.stripSuffix("$") + ".escapeLikeRegex" + val pattern = ctx.freshName("pattern") + + if (right.foldable) { + val rVal = right.eval() + if (rVal != null) { + val regexStr = + StringEscapeUtils.escapeJava(escape(rVal.asInstanceOf[UTF8String].toString())) + ctx.addMutableState(patternClass, pattern, + s"""$pattern = ${patternClass}.compile("$regexStr");""") + + // We don't use nullSafeCodeGen here because we don't want to re-evaluate right again. 
+ val eval = left.gen(ctx) + s""" + ${eval.code} + boolean ${ev.isNull} = ${eval.isNull}; + ${ctx.javaType(dataType)} ${ev.primitive} = ${ctx.defaultValue(dataType)}; + if (!${ev.isNull}) { + ${ev.primitive} = $pattern.matcher(${eval.primitive}.toString()).matches(); + } + """ + } else { + s""" + boolean ${ev.isNull} = true; + ${ctx.javaType(dataType)} ${ev.primitive} = ${ctx.defaultValue(dataType)}; + """ + } + } else { + nullSafeCodeGen(ctx, ev, (eval1, eval2) => { + s""" + String rightStr = ${eval2}.toString(); + ${patternClass} $pattern = ${patternClass}.compile($escapeFunc(rightStr)); + ${ev.primitive} = $pattern.matcher(${eval1}.toString()).matches(); + """ + }) + } + } +} + + +case class RLike(left: Expression, right: Expression) + extends BinaryExpression with StringRegexExpression with CodegenFallback { + + override def escape(v: String): String = v + override def matches(regex: Pattern, str: String): Boolean = regex.matcher(str).find(0) + override def toString: String = s"$left RLIKE $right" + + override protected def genCode(ctx: CodeGenContext, ev: GeneratedExpressionCode): String = { + val patternClass = classOf[Pattern].getName + val pattern = ctx.freshName("pattern") + + if (right.foldable) { + val rVal = right.eval() + if (rVal != null) { + val regexStr = + StringEscapeUtils.escapeJava(rVal.asInstanceOf[UTF8String].toString()) + ctx.addMutableState(patternClass, pattern, + s"""$pattern = ${patternClass}.compile("$regexStr");""") + + // We don't use nullSafeCodeGen here because we don't want to re-evaluate right again. + val eval = left.gen(ctx) + s""" + ${eval.code} + boolean ${ev.isNull} = ${eval.isNull}; + ${ctx.javaType(dataType)} ${ev.primitive} = ${ctx.defaultValue(dataType)}; + if (!${ev.isNull}) { + ${ev.primitive} = $pattern.matcher(${eval.primitive}.toString()).find(0); + } + """ + } else { + s""" + boolean ${ev.isNull} = true; + ${ctx.javaType(dataType)} ${ev.primitive} = ${ctx.defaultValue(dataType)}; + """ + } + } else { + nullSafeCodeGen(ctx, ev, (eval1, eval2) => { + s""" + String rightStr = ${eval2}.toString(); + ${patternClass} $pattern = ${patternClass}.compile(rightStr); + ${ev.primitive} = $pattern.matcher(${eval1}.toString()).find(0); + """ + }) + } + } +} + + +/** + * Splits str around pat (pattern is a regular expression). + */ +case class StringSplit(str: Expression, pattern: Expression) + extends BinaryExpression with ImplicitCastInputTypes { + + override def left: Expression = str + override def right: Expression = pattern + override def dataType: DataType = ArrayType(StringType) + override def inputTypes: Seq[DataType] = Seq(StringType, StringType) + + override def nullSafeEval(string: Any, regex: Any): Any = { + val strings = string.asInstanceOf[UTF8String].split(regex.asInstanceOf[UTF8String], -1) + new GenericArrayData(strings.asInstanceOf[Array[Any]]) + } + + override def genCode(ctx: CodeGenContext, ev: GeneratedExpressionCode): String = { + val arrayClass = classOf[GenericArrayData].getName + nullSafeCodeGen(ctx, ev, (str, pattern) => + // Array in java is covariant, so we don't need to cast UTF8String[] to Object[]. + s"""${ev.primitive} = new $arrayClass($str.split($pattern, -1));""") + } + + override def prettyName: String = "split" +} + + +/** + * Replace all substrings of str that match regexp with rep. + * + * NOTE: this expression is not THREAD-SAFE, as it has some internal mutable status. 
+ */ +case class RegExpReplace(subject: Expression, regexp: Expression, rep: Expression) + extends TernaryExpression with ImplicitCastInputTypes { + + // last regex in string, we will update the pattern iff regexp value changed. + @transient private var lastRegex: UTF8String = _ + // last regex pattern, we cache it for performance concern + @transient private var pattern: Pattern = _ + // last replacement string, we don't want to convert a UTF8String => java.langString every time. + @transient private var lastReplacement: String = _ + @transient private var lastReplacementInUTF8: UTF8String = _ + // result buffer write by Matcher + @transient private val result: StringBuffer = new StringBuffer + + override def nullSafeEval(s: Any, p: Any, r: Any): Any = { + if (!p.equals(lastRegex)) { + // regex value changed + lastRegex = p.asInstanceOf[UTF8String].clone() + pattern = Pattern.compile(lastRegex.toString) + } + if (!r.equals(lastReplacementInUTF8)) { + // replacement string changed + lastReplacementInUTF8 = r.asInstanceOf[UTF8String].clone() + lastReplacement = lastReplacementInUTF8.toString + } + val m = pattern.matcher(s.toString()) + result.delete(0, result.length()) + + while (m.find) { + m.appendReplacement(result, lastReplacement) + } + m.appendTail(result) + + UTF8String.fromString(result.toString) + } + + override def dataType: DataType = StringType + override def inputTypes: Seq[AbstractDataType] = Seq(StringType, StringType, StringType) + override def children: Seq[Expression] = subject :: regexp :: rep :: Nil + override def prettyName: String = "regexp_replace" + + override protected def genCode(ctx: CodeGenContext, ev: GeneratedExpressionCode): String = { + val termLastRegex = ctx.freshName("lastRegex") + val termPattern = ctx.freshName("pattern") + + val termLastReplacement = ctx.freshName("lastReplacement") + val termLastReplacementInUTF8 = ctx.freshName("lastReplacementInUTF8") + + val termResult = ctx.freshName("result") + + val classNamePattern = classOf[Pattern].getCanonicalName + val classNameStringBuffer = classOf[java.lang.StringBuffer].getCanonicalName + + ctx.addMutableState("UTF8String", termLastRegex, s"${termLastRegex} = null;") + ctx.addMutableState(classNamePattern, termPattern, s"${termPattern} = null;") + ctx.addMutableState("String", termLastReplacement, s"${termLastReplacement} = null;") + ctx.addMutableState("UTF8String", + termLastReplacementInUTF8, s"${termLastReplacementInUTF8} = null;") + ctx.addMutableState(classNameStringBuffer, + termResult, s"${termResult} = new $classNameStringBuffer();") + + nullSafeCodeGen(ctx, ev, (subject, regexp, rep) => { + s""" + if (!$regexp.equals(${termLastRegex})) { + // regex value changed + ${termLastRegex} = $regexp.clone(); + ${termPattern} = ${classNamePattern}.compile(${termLastRegex}.toString()); + } + if (!$rep.equals(${termLastReplacementInUTF8})) { + // replacement string changed + ${termLastReplacementInUTF8} = $rep.clone(); + ${termLastReplacement} = ${termLastReplacementInUTF8}.toString(); + } + ${termResult}.delete(0, ${termResult}.length()); + java.util.regex.Matcher m = ${termPattern}.matcher($subject.toString()); + + while (m.find()) { + m.appendReplacement(${termResult}, ${termLastReplacement}); + } + m.appendTail(${termResult}); + ${ev.primitive} = UTF8String.fromString(${termResult}.toString()); + ${ev.isNull} = false; + """ + }) + } +} + +/** + * Extract a specific(idx) group identified by a Java regex. + * + * NOTE: this expression is not THREAD-SAFE, as it has some internal mutable status. 
+ */ +case class RegExpExtract(subject: Expression, regexp: Expression, idx: Expression) + extends TernaryExpression with ImplicitCastInputTypes { + def this(s: Expression, r: Expression) = this(s, r, Literal(1)) + + // last regex in string, we will update the pattern iff regexp value changed. + @transient private var lastRegex: UTF8String = _ + // last regex pattern, we cache it for performance concern + @transient private var pattern: Pattern = _ + + override def nullSafeEval(s: Any, p: Any, r: Any): Any = { + if (!p.equals(lastRegex)) { + // regex value changed + lastRegex = p.asInstanceOf[UTF8String].clone() + pattern = Pattern.compile(lastRegex.toString) + } + val m = pattern.matcher(s.toString) + if (m.find) { + val mr: MatchResult = m.toMatchResult + UTF8String.fromString(mr.group(r.asInstanceOf[Int])) + } else { + UTF8String.EMPTY_UTF8 + } + } + + override def dataType: DataType = StringType + override def inputTypes: Seq[AbstractDataType] = Seq(StringType, StringType, IntegerType) + override def children: Seq[Expression] = subject :: regexp :: idx :: Nil + override def prettyName: String = "regexp_extract" + + override protected def genCode(ctx: CodeGenContext, ev: GeneratedExpressionCode): String = { + val termLastRegex = ctx.freshName("lastRegex") + val termPattern = ctx.freshName("pattern") + val classNamePattern = classOf[Pattern].getCanonicalName + + ctx.addMutableState("UTF8String", termLastRegex, s"${termLastRegex} = null;") + ctx.addMutableState(classNamePattern, termPattern, s"${termPattern} = null;") + + nullSafeCodeGen(ctx, ev, (subject, regexp, idx) => { + s""" + if (!$regexp.equals(${termLastRegex})) { + // regex value changed + ${termLastRegex} = $regexp.clone(); + ${termPattern} = ${classNamePattern}.compile(${termLastRegex}.toString()); + } + java.util.regex.Matcher m = + ${termPattern}.matcher($subject.toString()); + if (m.find()) { + java.util.regex.MatchResult mr = m.toMatchResult(); + ${ev.primitive} = UTF8String.fromString(mr.group($idx)); + ${ev.isNull} = false; + } else { + ${ev.primitive} = UTF8String.EMPTY_UTF8; + ${ev.isNull} = false; + }""" + }) + } +} diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/stringOperations.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/stringExpressions.scala similarity index 74% rename from sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/stringOperations.scala rename to sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/stringExpressions.scala index ca044d3e95e38..3c23f2ecfb57c 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/stringOperations.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/stringExpressions.scala @@ -21,13 +21,9 @@ import java.text.DecimalFormat import java.util.Arrays import java.util.{Map => JMap, HashMap} import java.util.Locale -import java.util.regex.{MatchResult, Pattern} - -import org.apache.commons.lang3.StringEscapeUtils import org.apache.spark.sql.catalyst.InternalRow import org.apache.spark.sql.catalyst.expressions.codegen._ -import org.apache.spark.sql.catalyst.util.StringUtils import org.apache.spark.sql.types._ import org.apache.spark.unsafe.types.UTF8String @@ -124,143 +120,6 @@ case class ConcatWs(children: Seq[Expression]) } } - -trait StringRegexExpression extends ImplicitCastInputTypes { - self: BinaryExpression => - - def escape(v: String): String - def matches(regex: Pattern, str: String): Boolean - - override def 
dataType: DataType = BooleanType - override def inputTypes: Seq[DataType] = Seq(StringType, StringType) - - // try cache the pattern for Literal - private lazy val cache: Pattern = right match { - case x @ Literal(value: String, StringType) => compile(value) - case _ => null - } - - protected def compile(str: String): Pattern = if (str == null) { - null - } else { - // Let it raise exception if couldn't compile the regex string - Pattern.compile(escape(str)) - } - - protected def pattern(str: String) = if (cache == null) compile(str) else cache - - protected override def nullSafeEval(input1: Any, input2: Any): Any = { - val regex = pattern(input2.asInstanceOf[UTF8String].toString()) - if(regex == null) { - null - } else { - matches(regex, input1.asInstanceOf[UTF8String].toString()) - } - } -} - -/** - * Simple RegEx pattern matching function - */ -case class Like(left: Expression, right: Expression) - extends BinaryExpression with StringRegexExpression with CodegenFallback { - - override def escape(v: String): String = StringUtils.escapeLikeRegex(v) - - override def matches(regex: Pattern, str: String): Boolean = regex.matcher(str).matches() - - override def toString: String = s"$left LIKE $right" - - override protected def genCode(ctx: CodeGenContext, ev: GeneratedExpressionCode): String = { - val patternClass = classOf[Pattern].getName - val escapeFunc = StringUtils.getClass.getName.stripSuffix("$") + ".escapeLikeRegex" - val pattern = ctx.freshName("pattern") - - if (right.foldable) { - val rVal = right.eval() - if (rVal != null) { - val regexStr = - StringEscapeUtils.escapeJava(escape(rVal.asInstanceOf[UTF8String].toString())) - ctx.addMutableState(patternClass, pattern, - s"""$pattern = ${patternClass}.compile("$regexStr");""") - - // We don't use nullSafeCodeGen here because we don't want to re-evaluate right again. - val eval = left.gen(ctx) - s""" - ${eval.code} - boolean ${ev.isNull} = ${eval.isNull}; - ${ctx.javaType(dataType)} ${ev.primitive} = ${ctx.defaultValue(dataType)}; - if (!${ev.isNull}) { - ${ev.primitive} = $pattern.matcher(${eval.primitive}.toString()).matches(); - } - """ - } else { - s""" - boolean ${ev.isNull} = true; - ${ctx.javaType(dataType)} ${ev.primitive} = ${ctx.defaultValue(dataType)}; - """ - } - } else { - nullSafeCodeGen(ctx, ev, (eval1, eval2) => { - s""" - String rightStr = ${eval2}.toString(); - ${patternClass} $pattern = ${patternClass}.compile($escapeFunc(rightStr)); - ${ev.primitive} = $pattern.matcher(${eval1}.toString()).matches(); - """ - }) - } - } -} - - -case class RLike(left: Expression, right: Expression) - extends BinaryExpression with StringRegexExpression with CodegenFallback { - - override def escape(v: String): String = v - override def matches(regex: Pattern, str: String): Boolean = regex.matcher(str).find(0) - override def toString: String = s"$left RLIKE $right" - - override protected def genCode(ctx: CodeGenContext, ev: GeneratedExpressionCode): String = { - val patternClass = classOf[Pattern].getName - val pattern = ctx.freshName("pattern") - - if (right.foldable) { - val rVal = right.eval() - if (rVal != null) { - val regexStr = - StringEscapeUtils.escapeJava(rVal.asInstanceOf[UTF8String].toString()) - ctx.addMutableState(patternClass, pattern, - s"""$pattern = ${patternClass}.compile("$regexStr");""") - - // We don't use nullSafeCodeGen here because we don't want to re-evaluate right again. 
- val eval = left.gen(ctx) - s""" - ${eval.code} - boolean ${ev.isNull} = ${eval.isNull}; - ${ctx.javaType(dataType)} ${ev.primitive} = ${ctx.defaultValue(dataType)}; - if (!${ev.isNull}) { - ${ev.primitive} = $pattern.matcher(${eval.primitive}.toString()).find(0); - } - """ - } else { - s""" - boolean ${ev.isNull} = true; - ${ctx.javaType(dataType)} ${ev.primitive} = ${ctx.defaultValue(dataType)}; - """ - } - } else { - nullSafeCodeGen(ctx, ev, (eval1, eval2) => { - s""" - String rightStr = ${eval2}.toString(); - ${patternClass} $pattern = ${patternClass}.compile(rightStr); - ${ev.primitive} = $pattern.matcher(${eval1}.toString()).find(0); - """ - }) - } - } -} - - trait String2StringExpression extends ImplicitCastInputTypes { self: UnaryExpression => @@ -305,7 +164,7 @@ case class Lower(child: Expression) extends UnaryExpression with String2StringEx } /** A base trait for functions that compare two strings, returning a boolean. */ -trait StringComparison extends ImplicitCastInputTypes { +trait StringPredicate extends Predicate with ImplicitCastInputTypes { self: BinaryExpression => def compare(l: UTF8String, r: UTF8String): Boolean @@ -322,7 +181,7 @@ trait StringComparison extends ImplicitCastInputTypes { * A function that returns true if the string `left` contains the string `right`. */ case class Contains(left: Expression, right: Expression) - extends BinaryExpression with Predicate with StringComparison { + extends BinaryExpression with StringPredicate { override def compare(l: UTF8String, r: UTF8String): Boolean = l.contains(r) override def genCode(ctx: CodeGenContext, ev: GeneratedExpressionCode): String = { defineCodeGen(ctx, ev, (c1, c2) => s"($c1).contains($c2)") @@ -333,7 +192,7 @@ case class Contains(left: Expression, right: Expression) * A function that returns true if the string `left` starts with the string `right`. */ case class StartsWith(left: Expression, right: Expression) - extends BinaryExpression with Predicate with StringComparison { + extends BinaryExpression with StringPredicate { override def compare(l: UTF8String, r: UTF8String): Boolean = l.startsWith(r) override def genCode(ctx: CodeGenContext, ev: GeneratedExpressionCode): String = { defineCodeGen(ctx, ev, (c1, c2) => s"($c1).startsWith($c2)") @@ -344,7 +203,7 @@ case class StartsWith(left: Expression, right: Expression) * A function that returns true if the string `left` ends with the string `right`. */ case class EndsWith(left: Expression, right: Expression) - extends BinaryExpression with Predicate with StringComparison { + extends BinaryExpression with StringPredicate { override def compare(l: UTF8String, r: UTF8String): Boolean = l.endsWith(r) override def genCode(ctx: CodeGenContext, ev: GeneratedExpressionCode): String = { defineCodeGen(ctx, ev, (c1, c2) => s"($c1).endsWith($c2)") @@ -769,32 +628,6 @@ case class StringSpace(child: Expression) override def prettyName: String = "space" } -/** - * Splits str around pat (pattern is a regular expression). 
- */ -case class StringSplit(str: Expression, pattern: Expression) - extends BinaryExpression with ImplicitCastInputTypes { - - override def left: Expression = str - override def right: Expression = pattern - override def dataType: DataType = ArrayType(StringType) - override def inputTypes: Seq[DataType] = Seq(StringType, StringType) - - override def nullSafeEval(string: Any, regex: Any): Any = { - val strings = string.asInstanceOf[UTF8String].split(regex.asInstanceOf[UTF8String], -1) - new GenericArrayData(strings.asInstanceOf[Array[Any]]) - } - - override def genCode(ctx: CodeGenContext, ev: GeneratedExpressionCode): String = { - val arrayClass = classOf[GenericArrayData].getName - nullSafeCodeGen(ctx, ev, (str, pattern) => - // Array in java is covariant, so we don't need to cast UTF8String[] to Object[]. - s"""${ev.primitive} = new $arrayClass($str.split($pattern, -1));""") - } - - override def prettyName: String = "split" -} - object Substring { def subStringBinarySQL(bytes: Array[Byte], pos: Int, len: Int): Array[Byte] = { if (pos > bytes.length) { @@ -1048,163 +881,6 @@ case class Encode(value: Expression, charset: Expression) } } -/** - * Replace all substrings of str that match regexp with rep. - * - * NOTE: this expression is not THREAD-SAFE, as it has some internal mutable status. - */ -case class RegExpReplace(subject: Expression, regexp: Expression, rep: Expression) - extends TernaryExpression with ImplicitCastInputTypes { - - // last regex in string, we will update the pattern iff regexp value changed. - @transient private var lastRegex: UTF8String = _ - // last regex pattern, we cache it for performance concern - @transient private var pattern: Pattern = _ - // last replacement string, we don't want to convert a UTF8String => java.langString every time. 
- @transient private var lastReplacement: String = _ - @transient private var lastReplacementInUTF8: UTF8String = _ - // result buffer write by Matcher - @transient private val result: StringBuffer = new StringBuffer - - override def nullSafeEval(s: Any, p: Any, r: Any): Any = { - if (!p.equals(lastRegex)) { - // regex value changed - lastRegex = p.asInstanceOf[UTF8String].clone() - pattern = Pattern.compile(lastRegex.toString) - } - if (!r.equals(lastReplacementInUTF8)) { - // replacement string changed - lastReplacementInUTF8 = r.asInstanceOf[UTF8String].clone() - lastReplacement = lastReplacementInUTF8.toString - } - val m = pattern.matcher(s.toString()) - result.delete(0, result.length()) - - while (m.find) { - m.appendReplacement(result, lastReplacement) - } - m.appendTail(result) - - UTF8String.fromString(result.toString) - } - - override def dataType: DataType = StringType - override def inputTypes: Seq[AbstractDataType] = Seq(StringType, StringType, StringType) - override def children: Seq[Expression] = subject :: regexp :: rep :: Nil - override def prettyName: String = "regexp_replace" - - override protected def genCode(ctx: CodeGenContext, ev: GeneratedExpressionCode): String = { - val termLastRegex = ctx.freshName("lastRegex") - val termPattern = ctx.freshName("pattern") - - val termLastReplacement = ctx.freshName("lastReplacement") - val termLastReplacementInUTF8 = ctx.freshName("lastReplacementInUTF8") - - val termResult = ctx.freshName("result") - - val classNamePattern = classOf[Pattern].getCanonicalName - val classNameStringBuffer = classOf[java.lang.StringBuffer].getCanonicalName - - ctx.addMutableState("UTF8String", - termLastRegex, s"${termLastRegex} = null;") - ctx.addMutableState(classNamePattern, - termPattern, s"${termPattern} = null;") - ctx.addMutableState("String", - termLastReplacement, s"${termLastReplacement} = null;") - ctx.addMutableState("UTF8String", - termLastReplacementInUTF8, s"${termLastReplacementInUTF8} = null;") - ctx.addMutableState(classNameStringBuffer, - termResult, s"${termResult} = new $classNameStringBuffer();") - - nullSafeCodeGen(ctx, ev, (subject, regexp, rep) => { - s""" - if (!$regexp.equals(${termLastRegex})) { - // regex value changed - ${termLastRegex} = $regexp.clone(); - ${termPattern} = ${classNamePattern}.compile(${termLastRegex}.toString()); - } - if (!$rep.equals(${termLastReplacementInUTF8})) { - // replacement string changed - ${termLastReplacementInUTF8} = $rep.clone(); - ${termLastReplacement} = ${termLastReplacementInUTF8}.toString(); - } - ${termResult}.delete(0, ${termResult}.length()); - java.util.regex.Matcher m = ${termPattern}.matcher($subject.toString()); - - while (m.find()) { - m.appendReplacement(${termResult}, ${termLastReplacement}); - } - m.appendTail(${termResult}); - ${ev.primitive} = UTF8String.fromString(${termResult}.toString()); - ${ev.isNull} = false; - """ - }) - } -} - -/** - * Extract a specific(idx) group identified by a Java regex. - * - * NOTE: this expression is not THREAD-SAFE, as it has some internal mutable status. - */ -case class RegExpExtract(subject: Expression, regexp: Expression, idx: Expression) - extends TernaryExpression with ImplicitCastInputTypes { - def this(s: Expression, r: Expression) = this(s, r, Literal(1)) - - // last regex in string, we will update the pattern iff regexp value changed. 
- @transient private var lastRegex: UTF8String = _ - // last regex pattern, we cache it for performance concern - @transient private var pattern: Pattern = _ - - override def nullSafeEval(s: Any, p: Any, r: Any): Any = { - if (!p.equals(lastRegex)) { - // regex value changed - lastRegex = p.asInstanceOf[UTF8String].clone() - pattern = Pattern.compile(lastRegex.toString) - } - val m = pattern.matcher(s.toString()) - if (m.find) { - val mr: MatchResult = m.toMatchResult - UTF8String.fromString(mr.group(r.asInstanceOf[Int])) - } else { - UTF8String.EMPTY_UTF8 - } - } - - override def dataType: DataType = StringType - override def inputTypes: Seq[AbstractDataType] = Seq(StringType, StringType, IntegerType) - override def children: Seq[Expression] = subject :: regexp :: idx :: Nil - override def prettyName: String = "regexp_extract" - - override protected def genCode(ctx: CodeGenContext, ev: GeneratedExpressionCode): String = { - val termLastRegex = ctx.freshName("lastRegex") - val termPattern = ctx.freshName("pattern") - val classNamePattern = classOf[Pattern].getCanonicalName - - ctx.addMutableState("UTF8String", termLastRegex, s"${termLastRegex} = null;") - ctx.addMutableState(classNamePattern, termPattern, s"${termPattern} = null;") - - nullSafeCodeGen(ctx, ev, (subject, regexp, idx) => { - s""" - if (!$regexp.equals(${termLastRegex})) { - // regex value changed - ${termLastRegex} = $regexp.clone(); - ${termPattern} = ${classNamePattern}.compile(${termLastRegex}.toString()); - } - java.util.regex.Matcher m = - ${termPattern}.matcher($subject.toString()); - if (m.find()) { - java.util.regex.MatchResult mr = m.toMatchResult(); - ${ev.primitive} = UTF8String.fromString(mr.group($idx)); - ${ev.isNull} = false; - } else { - ${ev.primitive} = UTF8String.EMPTY_UTF8; - ${ev.isNull} = false; - }""" - }) - } -} - /** * Formats the number X to a format like '#,###,###.##', rounded to D decimal places, * and returns the result as a string. 
If D is 0, the result has no decimal point or diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/optimizer/Optimizer.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/optimizer/Optimizer.scala index 42457d5318b48..854463dd11c74 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/optimizer/Optimizer.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/optimizer/Optimizer.scala @@ -372,7 +372,7 @@ object NullPropagation extends Rule[LogicalPlan] { case _ => e } - case e: StringComparison => e.children match { + case e: StringPredicate => e.children match { case Literal(null, _) :: right :: Nil => Literal.create(null, e.dataType) case left :: Literal(null, _) :: Nil => Literal.create(null, e.dataType) case _ => e diff --git a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/StringExpressionsSuite.scala b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/StringExpressionsSuite.scala index 426dc272471ae..99e3b13ce8c97 100644 --- a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/StringExpressionsSuite.scala +++ b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/StringExpressionsSuite.scala @@ -673,7 +673,7 @@ class StringExpressionsSuite extends SparkFunSuite with ExpressionEvalHelper { checkEvaluation(Length(Literal.create(null, BinaryType)), null, create_row(bytes)) } - test("number format") { + test("format_number / FormatNumber") { checkEvaluation(FormatNumber(Literal(4.asInstanceOf[Byte]), Literal(3)), "4.000") checkEvaluation(FormatNumber(Literal(4.asInstanceOf[Short]), Literal(3)), "4.000") checkEvaluation(FormatNumber(Literal(4.0f), Literal(3)), "4.000") diff --git a/sql/core/src/main/scala/org/apache/spark/sql/UDFRegistration.scala b/sql/core/src/main/scala/org/apache/spark/sql/UDFRegistration.scala index 1f270560d7bc1..fc4d0938c533a 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/UDFRegistration.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/UDFRegistration.scala @@ -56,6 +56,7 @@ class UDFRegistration private[sql] (sqlContext: SQLContext) extends Logging { /** * Register a user-defined aggregate function (UDAF). + * * @param name the name of the UDAF. * @param udaf the UDAF needs to be registered. * @return the registered UDAF. 
diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/aggregate/udaf.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/aggregate/udaf.scala index 7619f3ec9f0a7..d43d3dd9ffaae 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/aggregate/udaf.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/aggregate/udaf.scala @@ -304,7 +304,7 @@ private[sql] case class ScalaUDAF( override def nullable: Boolean = true - override def dataType: DataType = udaf.returnDataType + override def dataType: DataType = udaf.dataType override def deterministic: Boolean = udaf.deterministic diff --git a/sql/core/src/main/scala/org/apache/spark/sql/expressions/udaf.scala b/sql/core/src/main/scala/org/apache/spark/sql/expressions/udaf.scala index 5180871585f25..258afadc76951 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/expressions/udaf.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/expressions/udaf.scala @@ -17,7 +17,6 @@ package org.apache.spark.sql.expressions -import org.apache.spark.sql.catalyst.expressions.ScalaUDF import org.apache.spark.sql.catalyst.expressions.aggregate.{Complete, AggregateExpression2} import org.apache.spark.sql.execution.aggregate.ScalaUDAF import org.apache.spark.sql.{Column, Row} @@ -26,7 +25,7 @@ import org.apache.spark.annotation.Experimental /** * :: Experimental :: - * The abstract class for implementing user-defined aggregate functions. + * The base class for implementing user-defined aggregate functions (UDAF). */ @Experimental abstract class UserDefinedAggregateFunction extends Serializable { @@ -67,22 +66,35 @@ abstract class UserDefinedAggregateFunction extends Serializable { /** * The [[DataType]] of the returned value of this [[UserDefinedAggregateFunction]]. */ - def returnDataType: DataType + def dataType: DataType - /** Indicates if this function is deterministic. */ + /** + * Returns true iff this function is deterministic, i.e. given the same input, + * always return the same output. + */ def deterministic: Boolean /** - * Initializes the given aggregation buffer. Initial values set by this method should satisfy - * the condition that when merging two buffers with initial values, the new buffer - * still store initial values. + * Initializes the given aggregation buffer, i.e. the zero value of the aggregation buffer. + * + * The contract should be that applying the merge function on two initial buffers should just + * return the initial buffer itself, i.e. + * `merge(initialBuffer, initialBuffer)` should equal `initialBuffer`. */ def initialize(buffer: MutableAggregationBuffer): Unit - /** Updates the given aggregation buffer `buffer` with new input data from `input`. */ + /** + * Updates the given aggregation buffer `buffer` with new input data from `input`. + * + * This is called once per input row. + */ def update(buffer: MutableAggregationBuffer, input: Row): Unit - /** Merges two aggregation buffers and stores the updated buffer values back to `buffer1`. */ + /** + * Merges two aggregation buffers and stores the updated buffer values back to `buffer1`. + * + * This is called when we merge two partially aggregated data together. + */ def merge(buffer1: MutableAggregationBuffer, buffer2: Row): Unit /** @@ -92,7 +104,7 @@ abstract class UserDefinedAggregateFunction extends Serializable { def evaluate(buffer: Row): Any /** - * Creates a [[Column]] for this UDAF with given [[Column]]s as arguments. + * Creates a [[Column]] for this UDAF using given [[Column]]s as input arguments. 
*/ @scala.annotation.varargs def apply(exprs: Column*): Column = { @@ -105,16 +117,16 @@ abstract class UserDefinedAggregateFunction extends Serializable { } /** - * Creates a [[Column]] for this UDAF with given [[Column]]s as arguments. - * If `isDistinct` is true, this UDAF is working on distinct input values. + * Creates a [[Column]] for this UDAF using the distinct values of the given + * [[Column]]s as input arguments. */ @scala.annotation.varargs - def apply(isDistinct: Boolean, exprs: Column*): Column = { + def distinct(exprs: Column*): Column = { val aggregateExpression = AggregateExpression2( ScalaUDAF(exprs.map(_.expr), this), Complete, - isDistinct = isDistinct) + isDistinct = true) Column(aggregateExpression) } } @@ -122,9 +134,11 @@ abstract class UserDefinedAggregateFunction extends Serializable { /** * :: Experimental :: * A [[Row]] representing an mutable aggregation buffer. + * + * This is not meant to be extended outside of Spark. */ @Experimental -trait MutableAggregationBuffer extends Row { +abstract class MutableAggregationBuffer extends Row { /** Update the ith value of this buffer. */ def update(i: Int, value: Any): Unit diff --git a/sql/hive/src/test/java/test/org/apache/spark/sql/hive/JavaDataFrameSuite.java b/sql/hive/src/test/java/test/org/apache/spark/sql/hive/JavaDataFrameSuite.java index 21b053f07a3ba..a30dfa554eabc 100644 --- a/sql/hive/src/test/java/test/org/apache/spark/sql/hive/JavaDataFrameSuite.java +++ b/sql/hive/src/test/java/test/org/apache/spark/sql/hive/JavaDataFrameSuite.java @@ -92,7 +92,7 @@ public void testUDAF() { DataFrame aggregatedDF = df.groupBy() .agg( - udaf.apply(true, col("value")), + udaf.distinct(col("value")), udaf.apply(col("value")), registeredUDAF.apply(col("value")), callUDF("mydoublesum", col("value"))); diff --git a/sql/hive/src/test/java/test/org/apache/spark/sql/hive/aggregate/MyDoubleAvg.java b/sql/hive/src/test/java/test/org/apache/spark/sql/hive/aggregate/MyDoubleAvg.java index a2247e3da1554..2961b803f14aa 100644 --- a/sql/hive/src/test/java/test/org/apache/spark/sql/hive/aggregate/MyDoubleAvg.java +++ b/sql/hive/src/test/java/test/org/apache/spark/sql/hive/aggregate/MyDoubleAvg.java @@ -65,7 +65,7 @@ public MyDoubleAvg() { return _bufferSchema; } - @Override public DataType returnDataType() { + @Override public DataType dataType() { return _returnDataType; } diff --git a/sql/hive/src/test/java/test/org/apache/spark/sql/hive/aggregate/MyDoubleSum.java b/sql/hive/src/test/java/test/org/apache/spark/sql/hive/aggregate/MyDoubleSum.java index da29e24d267dd..c71882a6e7bed 100644 --- a/sql/hive/src/test/java/test/org/apache/spark/sql/hive/aggregate/MyDoubleSum.java +++ b/sql/hive/src/test/java/test/org/apache/spark/sql/hive/aggregate/MyDoubleSum.java @@ -60,7 +60,7 @@ public MyDoubleSum() { return _bufferSchema; } - @Override public DataType returnDataType() { + @Override public DataType dataType() { return _returnDataType; } From 1f29d502e7ecd6faa185d70dc714f9ea3922fb6d Mon Sep 17 00:00:00 2001 From: zsxwing Date: Wed, 19 Aug 2015 18:36:01 -0700 Subject: [PATCH 014/802] [SPARK-9812] [STREAMING] Fix Python 3 compatibility issue in PySpark Streaming and some docs This PR includes the following fixes: 1. Use `range` instead of `xrange` in `queue_stream.py` to support Python 3. 2. Fix the issue that `utf8_decoder` will return `bytes` rather than `str` when receiving an empty `bytes` in Python 3. 3. Fix the commands in docs so that the user can copy them directly to the command line. 
The previous commands was broken in the middle of a path, so when copying to the command line, the path would be split to two parts by the extra spaces, which forces the user to fix it manually. Author: zsxwing Closes #8315 from zsxwing/SPARK-9812. --- .../src/main/python/streaming/direct_kafka_wordcount.py | 6 +++--- examples/src/main/python/streaming/flume_wordcount.py | 5 +++-- examples/src/main/python/streaming/kafka_wordcount.py | 5 +++-- examples/src/main/python/streaming/mqtt_wordcount.py | 5 +++-- examples/src/main/python/streaming/queue_stream.py | 4 ++-- python/pyspark/streaming/flume.py | 4 +++- python/pyspark/streaming/kafka.py | 4 +++- python/pyspark/streaming/kinesis.py | 4 +++- 8 files changed, 23 insertions(+), 14 deletions(-) diff --git a/examples/src/main/python/streaming/direct_kafka_wordcount.py b/examples/src/main/python/streaming/direct_kafka_wordcount.py index 6ef188a220c51..ea20678b9acad 100644 --- a/examples/src/main/python/streaming/direct_kafka_wordcount.py +++ b/examples/src/main/python/streaming/direct_kafka_wordcount.py @@ -23,8 +23,8 @@ http://kafka.apache.org/documentation.html#quickstart and then run the example - `$ bin/spark-submit --jars external/kafka-assembly/target/scala-*/\ - spark-streaming-kafka-assembly-*.jar \ + `$ bin/spark-submit --jars \ + external/kafka-assembly/target/scala-*/spark-streaming-kafka-assembly-*.jar \ examples/src/main/python/streaming/direct_kafka_wordcount.py \ localhost:9092 test` """ @@ -37,7 +37,7 @@ if __name__ == "__main__": if len(sys.argv) != 3: - print >> sys.stderr, "Usage: direct_kafka_wordcount.py " + print("Usage: direct_kafka_wordcount.py ", file=sys.stderr) exit(-1) sc = SparkContext(appName="PythonStreamingDirectKafkaWordCount") diff --git a/examples/src/main/python/streaming/flume_wordcount.py b/examples/src/main/python/streaming/flume_wordcount.py index 091b64d8c4af4..d75bc6daac138 100644 --- a/examples/src/main/python/streaming/flume_wordcount.py +++ b/examples/src/main/python/streaming/flume_wordcount.py @@ -23,8 +23,9 @@ https://flume.apache.org/documentation.html and then run the example - `$ bin/spark-submit --jars external/flume-assembly/target/scala-*/\ - spark-streaming-flume-assembly-*.jar examples/src/main/python/streaming/flume_wordcount.py \ + `$ bin/spark-submit --jars \ + external/flume-assembly/target/scala-*/spark-streaming-flume-assembly-*.jar \ + examples/src/main/python/streaming/flume_wordcount.py \ localhost 12345 """ from __future__ import print_function diff --git a/examples/src/main/python/streaming/kafka_wordcount.py b/examples/src/main/python/streaming/kafka_wordcount.py index b178e7899b5e1..8d697f620f467 100644 --- a/examples/src/main/python/streaming/kafka_wordcount.py +++ b/examples/src/main/python/streaming/kafka_wordcount.py @@ -23,8 +23,9 @@ http://kafka.apache.org/documentation.html#quickstart and then run the example - `$ bin/spark-submit --jars external/kafka-assembly/target/scala-*/\ - spark-streaming-kafka-assembly-*.jar examples/src/main/python/streaming/kafka_wordcount.py \ + `$ bin/spark-submit --jars \ + external/kafka-assembly/target/scala-*/spark-streaming-kafka-assembly-*.jar \ + examples/src/main/python/streaming/kafka_wordcount.py \ localhost:2181 test` """ from __future__ import print_function diff --git a/examples/src/main/python/streaming/mqtt_wordcount.py b/examples/src/main/python/streaming/mqtt_wordcount.py index 617ce5ea6775e..abf9c0e21d307 100644 --- a/examples/src/main/python/streaming/mqtt_wordcount.py +++ 
b/examples/src/main/python/streaming/mqtt_wordcount.py @@ -26,8 +26,9 @@ http://www.eclipse.org/paho/#getting-started and then run the example - `$ bin/spark-submit --jars external/mqtt-assembly/target/scala-*/\ - spark-streaming-mqtt-assembly-*.jar examples/src/main/python/streaming/mqtt_wordcount.py \ + `$ bin/spark-submit --jars \ + external/mqtt-assembly/target/scala-*/spark-streaming-mqtt-assembly-*.jar \ + examples/src/main/python/streaming/mqtt_wordcount.py \ tcp://localhost:1883 foo` """ diff --git a/examples/src/main/python/streaming/queue_stream.py b/examples/src/main/python/streaming/queue_stream.py index dcd6a0fc6ff91..b3808907f74a6 100644 --- a/examples/src/main/python/streaming/queue_stream.py +++ b/examples/src/main/python/streaming/queue_stream.py @@ -36,8 +36,8 @@ # Create the queue through which RDDs can be pushed to # a QueueInputDStream rddQueue = [] - for i in xrange(5): - rddQueue += [ssc.sparkContext.parallelize([j for j in xrange(1, 1001)], 10)] + for i in range(5): + rddQueue += [ssc.sparkContext.parallelize([j for j in range(1, 1001)], 10)] # Create the QueueInputDStream and use it do some processing inputStream = ssc.queueStream(rddQueue) diff --git a/python/pyspark/streaming/flume.py b/python/pyspark/streaming/flume.py index cbb573f226bbe..c0cdc50d8d423 100644 --- a/python/pyspark/streaming/flume.py +++ b/python/pyspark/streaming/flume.py @@ -31,7 +31,9 @@ def utf8_decoder(s): """ Decode the unicode as UTF-8 """ - return s and s.decode('utf-8') + if s is None: + return None + return s.decode('utf-8') class FlumeUtils(object): diff --git a/python/pyspark/streaming/kafka.py b/python/pyspark/streaming/kafka.py index dc5b7fd878aef..8a814c64c0423 100644 --- a/python/pyspark/streaming/kafka.py +++ b/python/pyspark/streaming/kafka.py @@ -29,7 +29,9 @@ def utf8_decoder(s): """ Decode the unicode as UTF-8 """ - return s and s.decode('utf-8') + if s is None: + return None + return s.decode('utf-8') class KafkaUtils(object): diff --git a/python/pyspark/streaming/kinesis.py b/python/pyspark/streaming/kinesis.py index bcfe2703fecf9..34be5880e1708 100644 --- a/python/pyspark/streaming/kinesis.py +++ b/python/pyspark/streaming/kinesis.py @@ -26,7 +26,9 @@ def utf8_decoder(s): """ Decode the unicode as UTF-8 """ - return s and s.decode('utf-8') + if s is None: + return None + return s.decode('utf-8') class KinesisUtils(object): From affc8a887ede9fdc2ca6051833954cd10918c869 Mon Sep 17 00:00:00 2001 From: zsxwing Date: Wed, 19 Aug 2015 19:43:09 -0700 Subject: [PATCH 015/802] [SPARK-10125] [STREAMING] Fix a potential deadlock in JobGenerator.stop Because `lazy val` uses `this` lock, if JobGenerator.stop and JobGenerator.doCheckpoint (JobGenerator.shouldCheckpoint has not yet been initialized) run at the same time, it may hang. 
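The hazard being fixed is a general property of how Scala 2 implements `lazy val`: the initializer runs inside a block synchronized on the enclosing instance. The following self-contained sketch (class and member names are invented and it has no Spark dependencies; it is an illustration of the failure shape, not code from this patch) reproduces the same kind of deadlock, with a `synchronized` stop method joining a worker thread while that worker is blocked on the same monitor trying to initialize a lazy val.

```scala
object LazyValDeadlockSketch {
  class Generator {
    // In Scala 2, initializing this lazy val synchronizes on the Generator instance.
    lazy val shouldCheckpoint: Boolean = true

    private val worker = new Thread(new Runnable {
      override def run(): Unit = {
        Thread.sleep(50)          // analogue of the event loop picking up an event
        if (shouldCheckpoint) ()  // needs the Generator's monitor to run the initializer
      }
    })

    def start(): Unit = worker.start()

    // Analogue of a synchronized stop(): it holds the Generator's monitor while
    // waiting for the worker, but the worker is waiting for that same monitor.
    def stop(): Unit = synchronized {
      worker.join()
    }
  }

  def main(args: Array[String]): Unit = {
    val g = new Generator
    g.start()
    g.stop()  // very likely hangs: the classic lazy-val plus synchronized deadlock
  }
}
```

Forcing the lazy member to initialize before any other thread can touch it, as the patch does with `checkpointWriter`, removes the second thread's need for the monitor and breaks the cycle.
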
Here are the stack traces for the deadlock: ```Java "pool-1-thread-1-ScalaTest-running-StreamingListenerSuite" #11 prio=5 os_prio=31 tid=0x00007fd35d094800 nid=0x5703 in Object.wait() [0x000000012ecaf000] java.lang.Thread.State: WAITING (on object monitor) at java.lang.Object.wait(Native Method) at java.lang.Thread.join(Thread.java:1245) - locked <0x00000007b5d8d7f8> (a org.apache.spark.util.EventLoop$$anon$1) at java.lang.Thread.join(Thread.java:1319) at org.apache.spark.util.EventLoop.stop(EventLoop.scala:81) at org.apache.spark.streaming.scheduler.JobGenerator.stop(JobGenerator.scala:155) - locked <0x00000007b5d8cea0> (a org.apache.spark.streaming.scheduler.JobGenerator) at org.apache.spark.streaming.scheduler.JobScheduler.stop(JobScheduler.scala:95) - locked <0x00000007b5d8ced8> (a org.apache.spark.streaming.scheduler.JobScheduler) at org.apache.spark.streaming.StreamingContext.stop(StreamingContext.scala:687) "JobGenerator" #67 daemon prio=5 os_prio=31 tid=0x00007fd35c3b9800 nid=0x9f03 waiting for monitor entry [0x0000000139e4a000] java.lang.Thread.State: BLOCKED (on object monitor) at org.apache.spark.streaming.scheduler.JobGenerator.shouldCheckpoint$lzycompute(JobGenerator.scala:63) - waiting to lock <0x00000007b5d8cea0> (a org.apache.spark.streaming.scheduler.JobGenerator) at org.apache.spark.streaming.scheduler.JobGenerator.shouldCheckpoint(JobGenerator.scala:63) at org.apache.spark.streaming.scheduler.JobGenerator.doCheckpoint(JobGenerator.scala:290) at org.apache.spark.streaming.scheduler.JobGenerator.org$apache$spark$streaming$scheduler$JobGenerator$$processEvent(JobGenerator.scala:182) at org.apache.spark.streaming.scheduler.JobGenerator$$anon$1.onReceive(JobGenerator.scala:83) at org.apache.spark.streaming.scheduler.JobGenerator$$anon$1.onReceive(JobGenerator.scala:82) at org.apache.spark.util.EventLoop$$anon$1.run(EventLoop.scala:48) ``` I can use this patch to produce this deadlock: https://github.com/zsxwing/spark/commit/8a88f28d1331003a65fabef48ae3d22a7c21f05f And a timeout build in Jenkins due to this deadlock: https://amplab.cs.berkeley.edu/jenkins/job/NewSparkPullRequestBuilder/1654/ This PR initializes `checkpointWriter` before `eventLoop` uses it to avoid this deadlock. Author: zsxwing Closes #8326 from zsxwing/SPARK-10125. --- .../org/apache/spark/streaming/scheduler/JobGenerator.scala | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/streaming/src/main/scala/org/apache/spark/streaming/scheduler/JobGenerator.scala b/streaming/src/main/scala/org/apache/spark/streaming/scheduler/JobGenerator.scala index 9f2117ada61c0..2de035d166e7b 100644 --- a/streaming/src/main/scala/org/apache/spark/streaming/scheduler/JobGenerator.scala +++ b/streaming/src/main/scala/org/apache/spark/streaming/scheduler/JobGenerator.scala @@ -79,6 +79,10 @@ class JobGenerator(jobScheduler: JobScheduler) extends Logging { def start(): Unit = synchronized { if (eventLoop != null) return // generator has already been started + // Call checkpointWriter here to initialize it before eventLoop uses it to avoid a deadlock. + // See SPARK-10125 + checkpointWriter + eventLoop = new EventLoop[JobGeneratorEvent]("JobGenerator") { override protected def onReceive(event: JobGeneratorEvent): Unit = processEvent(event) From 73431d8afb41b93888d2642a1ce2d011f03fb740 Mon Sep 17 00:00:00 2001 From: Timothy Chen Date: Wed, 19 Aug 2015 19:43:26 -0700 Subject: [PATCH 016/802] [SPARK-10124] [MESOS] Fix removing queued driver in mesos cluster mode. 
Currently the spark applications can be queued to the Mesos cluster dispatcher, but when multiple jobs are in queue we don't handle removing jobs from the buffer correctly while iterating and causes null pointer exception. This patch copies the buffer before iterating them, so exceptions aren't thrown when the jobs are removed. Author: Timothy Chen Closes #8322 from tnachen/fix_cluster_mode. --- .../cluster/mesos/MesosClusterScheduler.scala | 19 +++++++++++-------- 1 file changed, 11 insertions(+), 8 deletions(-) diff --git a/core/src/main/scala/org/apache/spark/scheduler/cluster/mesos/MesosClusterScheduler.scala b/core/src/main/scala/org/apache/spark/scheduler/cluster/mesos/MesosClusterScheduler.scala index 64ec2b8e3db15..1206f184fbc82 100644 --- a/core/src/main/scala/org/apache/spark/scheduler/cluster/mesos/MesosClusterScheduler.scala +++ b/core/src/main/scala/org/apache/spark/scheduler/cluster/mesos/MesosClusterScheduler.scala @@ -507,14 +507,16 @@ private[spark] class MesosClusterScheduler( val driversToRetry = pendingRetryDrivers.filter { d => d.retryState.get.nextRetry.before(currentTime) } + scheduleTasks( - driversToRetry, + copyBuffer(driversToRetry), removeFromPendingRetryDrivers, currentOffers, tasks) + // Then we walk through the queued drivers and try to schedule them. scheduleTasks( - queuedDrivers, + copyBuffer(queuedDrivers), removeFromQueuedDrivers, currentOffers, tasks) @@ -527,13 +529,14 @@ private[spark] class MesosClusterScheduler( .foreach(o => driver.declineOffer(o.getId)) } + private def copyBuffer( + buffer: ArrayBuffer[MesosDriverDescription]): ArrayBuffer[MesosDriverDescription] = { + val newBuffer = new ArrayBuffer[MesosDriverDescription](buffer.size) + buffer.copyToBuffer(newBuffer) + newBuffer + } + def getSchedulerState(): MesosClusterSchedulerState = { - def copyBuffer( - buffer: ArrayBuffer[MesosDriverDescription]): ArrayBuffer[MesosDriverDescription] = { - val newBuffer = new ArrayBuffer[MesosDriverDescription](buffer.size) - buffer.copyToBuffer(newBuffer) - newBuffer - } stateLock.synchronized { new MesosClusterSchedulerState( frameworkId, From b762f9920f7587d3c08493c49dd2fede62110b88 Mon Sep 17 00:00:00 2001 From: Tathagata Das Date: Wed, 19 Aug 2015 21:15:58 -0700 Subject: [PATCH 017/802] [SPARK-10128] [STREAMING] Used correct classloader to deserialize WAL data Recovering Kinesis sequence numbers from WAL leads to classnotfoundexception because the ObjectInputStream does not use the correct classloader and the SequenceNumberRanges class (in streaming-kinesis-asl package) cannot be found (added through spark-submit) while deserializing. The solution is to use `Thread.currentThread().getContextClassLoader` while deserializing. 
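For context on why the context classloader matters here: a plain `ObjectInputStream` resolves class names through the loader of the calling code rather than the thread's context classloader, so classes shipped at submit time with `--jars` (such as the Kinesis ones in this case) may not be visible to it. The sketch below shows the general loader-aware pattern the fix relies on; `LoaderAwareSerde.deserializeWithLoader` is a hypothetical helper with the same shape as the call being changed, not the body of Spark's `Utils.deserialize`.

```scala
import java.io.{ByteArrayInputStream, ObjectInputStream, ObjectStreamClass}

object LoaderAwareSerde {
  // Deserialize `bytes`, resolving classes against an explicitly supplied loader.
  def deserializeWithLoader[T](bytes: Array[Byte], loader: ClassLoader): T = {
    val in = new ObjectInputStream(new ByteArrayInputStream(bytes)) {
      override def resolveClass(desc: ObjectStreamClass): Class[_] =
        try {
          // Look the class up through the supplied loader first.
          Class.forName(desc.getName, false, loader)
        } catch {
          // Fall back to default resolution (covers primitives and bootstrap classes).
          case _: ClassNotFoundException => super.resolveClass(desc)
        }
    }
    try in.readObject().asInstanceOf[T] finally in.close()
  }
}

// A call site analogous to the fix passes the context classloader explicitly:
//   LoaderAwareSerde.deserializeWithLoader[AnyRef](
//     byteBuffer.array, Thread.currentThread().getContextClassLoader)
```
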
Author: Tathagata Das Closes #8328 from tdas/SPARK-10128 and squashes the following commits: f19b1c2 [Tathagata Das] Used correct classloader to deserialize WAL data --- .../spark/streaming/scheduler/ReceivedBlockTracker.scala | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/streaming/src/main/scala/org/apache/spark/streaming/scheduler/ReceivedBlockTracker.scala b/streaming/src/main/scala/org/apache/spark/streaming/scheduler/ReceivedBlockTracker.scala index 7720259a5d794..53b96d51c9180 100644 --- a/streaming/src/main/scala/org/apache/spark/streaming/scheduler/ReceivedBlockTracker.scala +++ b/streaming/src/main/scala/org/apache/spark/streaming/scheduler/ReceivedBlockTracker.scala @@ -28,7 +28,7 @@ import org.apache.hadoop.fs.Path import org.apache.spark.streaming.Time import org.apache.spark.streaming.util.{WriteAheadLog, WriteAheadLogUtils} import org.apache.spark.util.{Clock, Utils} -import org.apache.spark.{Logging, SparkConf, SparkException} +import org.apache.spark.{Logging, SparkConf} /** Trait representing any event in the ReceivedBlockTracker that updates its state. */ private[streaming] sealed trait ReceivedBlockTrackerLogEvent @@ -199,7 +199,8 @@ private[streaming] class ReceivedBlockTracker( import scala.collection.JavaConversions._ writeAheadLog.readAll().foreach { byteBuffer => logTrace("Recovering record " + byteBuffer) - Utils.deserialize[ReceivedBlockTrackerLogEvent](byteBuffer.array) match { + Utils.deserialize[ReceivedBlockTrackerLogEvent]( + byteBuffer.array, Thread.currentThread().getContextClassLoader) match { case BlockAdditionEvent(receivedBlockInfo) => insertAddedBlock(receivedBlockInfo) case BatchAllocationEvent(time, allocatedBlocks) => From 43e0135421b2262cbb0e06aae53523f663b4f959 Mon Sep 17 00:00:00 2001 From: Yin Huai Date: Thu, 20 Aug 2015 15:30:31 +0800 Subject: [PATCH 018/802] [SPARK-10092] [SQL] Multi-DB support follow up. https://issues.apache.org/jira/browse/SPARK-10092 This pr is a follow-up one for Multi-DB support. It has the following changes: * `HiveContext.refreshTable` now accepts `dbName.tableName`. * `HiveContext.analyze` now accepts `dbName.tableName`. * `CreateTableUsing`, `CreateTableUsingAsSelect`, `CreateTempTableUsing`, `CreateTempTableUsingAsSelect`, `CreateMetastoreDataSource`, and `CreateMetastoreDataSourceAsSelect` all take `TableIdentifier` instead of the string representation of table name. * When you call `saveAsTable` with a specified database, the data will be saved to the correct location. * Explicitly do not allow users to create a temporary with a specified database name (users cannot do it before). * When we save table to metastore, we also check if db name and table name can be accepted by hive (using `MetaStoreUtils.validateName`). Author: Yin Huai Closes #8324 from yhuai/saveAsTableDB. 
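A short usage sketch may help make the list of behaviour changes above concrete. The database and table names below are invented, and the snippet assumes a Spark 1.5-era `HiveContext` as used throughout this patch series; it illustrates the described behaviour rather than reproducing code from the patch.

```scala
import org.apache.spark.{SparkConf, SparkContext}
import org.apache.spark.sql.hive.HiveContext

object MultiDbSketch {
  def main(args: Array[String]): Unit = {
    val sc = new SparkContext(new SparkConf().setAppName("multi-db-sketch"))
    val hiveContext = new HiveContext(sc)
    val df = hiveContext.range(0, 100).toDF("n")

    hiveContext.sql("CREATE DATABASE IF NOT EXISTS sales_db")

    // saveAsTable with a database-qualified name now writes under sales_db's location.
    df.write.format("parquet").saveAsTable("sales_db.daily_totals")

    // refreshTable and analyze accept the same dbName.tableName form after this change.
    hiveContext.refreshTable("sales_db.daily_totals")
    hiveContext.analyze("sales_db.daily_totals")

    // Temporary tables still cannot carry a database qualifier; a table name that
    // happens to contain dots must be quoted with backticks instead.
    df.registerTempTable("daily_totals_tmp")

    sc.stop()
  }
}
```
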
--- .../spark/sql/catalyst/TableIdentifier.scala | 4 +- .../spark/sql/catalyst/analysis/Catalog.scala | 63 +++++-- .../apache/spark/sql/DataFrameWriter.scala | 2 +- .../org/apache/spark/sql/SQLContext.scala | 15 +- .../spark/sql/execution/SparkStrategies.scala | 10 +- .../sql/execution/datasources/DDLParser.scala | 32 ++-- .../spark/sql/execution/datasources/ddl.scala | 22 +-- .../sql/execution/datasources/rules.scala | 8 +- .../org/apache/spark/sql/SQLQuerySuite.scala | 35 ++++ .../apache/spark/sql/hive/HiveContext.scala | 14 +- .../spark/sql/hive/HiveMetastoreCatalog.scala | 22 ++- .../spark/sql/hive/HiveStrategies.scala | 12 +- .../spark/sql/hive/execution/commands.scala | 54 ++++-- .../spark/sql/hive/ListTablesSuite.scala | 6 - .../spark/sql/hive/MultiDatabaseSuite.scala | 158 +++++++++++++++++- .../sql/hive/execution/SQLQuerySuite.scala | 35 ++++ 16 files changed, 398 insertions(+), 94 deletions(-) diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/TableIdentifier.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/TableIdentifier.scala index aebcdeb9d070f..d701559bf2d9b 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/TableIdentifier.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/TableIdentifier.scala @@ -25,7 +25,9 @@ private[sql] case class TableIdentifier(table: String, database: Option[String] def toSeq: Seq[String] = database.toSeq :+ table - override def toString: String = toSeq.map("`" + _ + "`").mkString(".") + override def toString: String = quotedString + + def quotedString: String = toSeq.map("`" + _ + "`").mkString(".") def unquotedString: String = toSeq.mkString(".") } diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/Catalog.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/Catalog.scala index 5766e6a2dd51a..503c4f4b20f38 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/Catalog.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/Catalog.scala @@ -23,6 +23,7 @@ import scala.collection.JavaConversions._ import scala.collection.mutable import scala.collection.mutable.ArrayBuffer +import org.apache.spark.sql.AnalysisException import org.apache.spark.sql.catalyst.{TableIdentifier, CatalystConf, EmptyConf} import org.apache.spark.sql.catalyst.plans.logical.{LogicalPlan, Subquery} @@ -55,12 +56,15 @@ trait Catalog { def refreshTable(tableIdent: TableIdentifier): Unit + // TODO: Refactor it in the work of SPARK-10104 def registerTable(tableIdentifier: Seq[String], plan: LogicalPlan): Unit + // TODO: Refactor it in the work of SPARK-10104 def unregisterTable(tableIdentifier: Seq[String]): Unit def unregisterAllTables(): Unit + // TODO: Refactor it in the work of SPARK-10104 protected def processTableIdentifier(tableIdentifier: Seq[String]): Seq[String] = { if (conf.caseSensitiveAnalysis) { tableIdentifier @@ -69,6 +73,7 @@ trait Catalog { } } + // TODO: Refactor it in the work of SPARK-10104 protected def getDbTableName(tableIdent: Seq[String]): String = { val size = tableIdent.size if (size <= 2) { @@ -78,9 +83,22 @@ trait Catalog { } } + // TODO: Refactor it in the work of SPARK-10104 protected def getDBTable(tableIdent: Seq[String]) : (Option[String], String) = { (tableIdent.lift(tableIdent.size - 2), tableIdent.last) } + + /** + * It is not allowed to specifiy database name for tables stored in [[SimpleCatalog]]. + * We use this method to check it. 
+ */ + protected def checkTableIdentifier(tableIdentifier: Seq[String]): Unit = { + if (tableIdentifier.length > 1) { + throw new AnalysisException("Specifying database name or other qualifiers are not allowed " + + "for temporary tables. If the table name has dots (.) in it, please quote the " + + "table name with backticks (`).") + } + } } class SimpleCatalog(val conf: CatalystConf) extends Catalog { @@ -89,11 +107,13 @@ class SimpleCatalog(val conf: CatalystConf) extends Catalog { override def registerTable( tableIdentifier: Seq[String], plan: LogicalPlan): Unit = { + checkTableIdentifier(tableIdentifier) val tableIdent = processTableIdentifier(tableIdentifier) tables.put(getDbTableName(tableIdent), plan) } override def unregisterTable(tableIdentifier: Seq[String]): Unit = { + checkTableIdentifier(tableIdentifier) val tableIdent = processTableIdentifier(tableIdentifier) tables.remove(getDbTableName(tableIdent)) } @@ -103,6 +123,7 @@ class SimpleCatalog(val conf: CatalystConf) extends Catalog { } override def tableExists(tableIdentifier: Seq[String]): Boolean = { + checkTableIdentifier(tableIdentifier) val tableIdent = processTableIdentifier(tableIdentifier) tables.containsKey(getDbTableName(tableIdent)) } @@ -110,6 +131,7 @@ class SimpleCatalog(val conf: CatalystConf) extends Catalog { override def lookupRelation( tableIdentifier: Seq[String], alias: Option[String] = None): LogicalPlan = { + checkTableIdentifier(tableIdentifier) val tableIdent = processTableIdentifier(tableIdentifier) val tableFullName = getDbTableName(tableIdent) val table = tables.get(tableFullName) @@ -149,7 +171,13 @@ trait OverrideCatalog extends Catalog { abstract override def tableExists(tableIdentifier: Seq[String]): Boolean = { val tableIdent = processTableIdentifier(tableIdentifier) - overrides.get(getDBTable(tableIdent)) match { + // A temporary tables only has a single part in the tableIdentifier. + val overriddenTable = if (tableIdentifier.length > 1) { + None: Option[LogicalPlan] + } else { + overrides.get(getDBTable(tableIdent)) + } + overriddenTable match { case Some(_) => true case None => super.tableExists(tableIdentifier) } @@ -159,7 +187,12 @@ trait OverrideCatalog extends Catalog { tableIdentifier: Seq[String], alias: Option[String] = None): LogicalPlan = { val tableIdent = processTableIdentifier(tableIdentifier) - val overriddenTable = overrides.get(getDBTable(tableIdent)) + // A temporary tables only has a single part in the tableIdentifier. + val overriddenTable = if (tableIdentifier.length > 1) { + None: Option[LogicalPlan] + } else { + overrides.get(getDBTable(tableIdent)) + } val tableWithQualifers = overriddenTable.map(r => Subquery(tableIdent.last, r)) // If an alias was specified by the lookup, wrap the plan in a subquery so that attributes are @@ -171,20 +204,8 @@ trait OverrideCatalog extends Catalog { } abstract override def getTables(databaseName: Option[String]): Seq[(String, Boolean)] = { - val dbName = if (conf.caseSensitiveAnalysis) { - databaseName - } else { - if (databaseName.isDefined) Some(databaseName.get.toLowerCase) else None - } - - val temporaryTables = overrides.filter { - // If a temporary table does not have an associated database, we should return its name. - case ((None, _), _) => true - // If a temporary table does have an associated database, we should return it if the database - // matches the given database name. - case ((db: Some[String], _), _) if db == dbName => true - case _ => false - }.map { + // We always return all temporary tables. 
+ val temporaryTables = overrides.map { case ((_, tableName), _) => (tableName, true) }.toSeq @@ -194,13 +215,19 @@ trait OverrideCatalog extends Catalog { override def registerTable( tableIdentifier: Seq[String], plan: LogicalPlan): Unit = { + checkTableIdentifier(tableIdentifier) val tableIdent = processTableIdentifier(tableIdentifier) overrides.put(getDBTable(tableIdent), plan) } override def unregisterTable(tableIdentifier: Seq[String]): Unit = { - val tableIdent = processTableIdentifier(tableIdentifier) - overrides.remove(getDBTable(tableIdent)) + // A temporary tables only has a single part in the tableIdentifier. + // If tableIdentifier has more than one parts, it is not a temporary table + // and we do not need to do anything at here. + if (tableIdentifier.length == 1) { + val tableIdent = processTableIdentifier(tableIdentifier) + overrides.remove(getDBTable(tableIdent)) + } } override def unregisterAllTables(): Unit = { diff --git a/sql/core/src/main/scala/org/apache/spark/sql/DataFrameWriter.scala b/sql/core/src/main/scala/org/apache/spark/sql/DataFrameWriter.scala index f0bf1be506411..ce8744b53175b 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/DataFrameWriter.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/DataFrameWriter.scala @@ -218,7 +218,7 @@ final class DataFrameWriter private[sql](df: DataFrame) { case _ => val cmd = CreateTableUsingAsSelect( - tableIdent.unquotedString, + tableIdent, source, temporary = false, partitioningColumns.map(_.toArray).getOrElse(Array.empty[String]), diff --git a/sql/core/src/main/scala/org/apache/spark/sql/SQLContext.scala b/sql/core/src/main/scala/org/apache/spark/sql/SQLContext.scala index 58fe75b59f418..126c9c6f839c7 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/SQLContext.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/SQLContext.scala @@ -584,9 +584,10 @@ class SQLContext(@transient val sparkContext: SparkContext) tableName: String, source: String, options: Map[String, String]): DataFrame = { + val tableIdent = new SqlParser().parseTableIdentifier(tableName) val cmd = CreateTableUsing( - tableName, + tableIdent, userSpecifiedSchema = None, source, temporary = false, @@ -594,7 +595,7 @@ class SQLContext(@transient val sparkContext: SparkContext) allowExisting = false, managedIfNoPath = false) executePlan(cmd).toRdd - table(tableName) + table(tableIdent) } /** @@ -629,9 +630,10 @@ class SQLContext(@transient val sparkContext: SparkContext) source: String, schema: StructType, options: Map[String, String]): DataFrame = { + val tableIdent = new SqlParser().parseTableIdentifier(tableName) val cmd = CreateTableUsing( - tableName, + tableIdent, userSpecifiedSchema = Some(schema), source, temporary = false, @@ -639,7 +641,7 @@ class SQLContext(@transient val sparkContext: SparkContext) allowExisting = false, managedIfNoPath = false) executePlan(cmd).toRdd - table(tableName) + table(tableIdent) } /** @@ -724,7 +726,10 @@ class SQLContext(@transient val sparkContext: SparkContext) * @since 1.3.0 */ def table(tableName: String): DataFrame = { - val tableIdent = new SqlParser().parseTableIdentifier(tableName) + table(new SqlParser().parseTableIdentifier(tableName)) + } + + private def table(tableIdent: TableIdentifier): DataFrame = { DataFrame(this, catalog.lookupRelation(tableIdent.toSeq)) } diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/SparkStrategies.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/SparkStrategies.scala index 1fc870d44b578..4df53687a0731 100644 --- 
a/sql/core/src/main/scala/org/apache/spark/sql/execution/SparkStrategies.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/SparkStrategies.scala @@ -395,22 +395,22 @@ private[sql] abstract class SparkStrategies extends QueryPlanner[SparkPlan] { object DDLStrategy extends Strategy { def apply(plan: LogicalPlan): Seq[SparkPlan] = plan match { - case CreateTableUsing(tableName, userSpecifiedSchema, provider, true, opts, false, _) => + case CreateTableUsing(tableIdent, userSpecifiedSchema, provider, true, opts, false, _) => ExecutedCommand( CreateTempTableUsing( - tableName, userSpecifiedSchema, provider, opts)) :: Nil + tableIdent, userSpecifiedSchema, provider, opts)) :: Nil case c: CreateTableUsing if !c.temporary => sys.error("Tables created with SQLContext must be TEMPORARY. Use a HiveContext instead.") case c: CreateTableUsing if c.temporary && c.allowExisting => sys.error("allowExisting should be set to false when creating a temporary table.") - case CreateTableUsingAsSelect(tableName, provider, true, partitionsCols, mode, opts, query) + case CreateTableUsingAsSelect(tableIdent, provider, true, partitionsCols, mode, opts, query) if partitionsCols.nonEmpty => sys.error("Cannot create temporary partitioned table.") - case CreateTableUsingAsSelect(tableName, provider, true, _, mode, opts, query) => + case CreateTableUsingAsSelect(tableIdent, provider, true, _, mode, opts, query) => val cmd = CreateTempTableUsingAsSelect( - tableName, provider, Array.empty[String], mode, opts, query) + tableIdent, provider, Array.empty[String], mode, opts, query) ExecutedCommand(cmd) :: Nil case c: CreateTableUsingAsSelect if !c.temporary => sys.error("Tables created with SQLContext must be TEMPORARY. Use a HiveContext instead.") diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/DDLParser.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/DDLParser.scala index 6c462fa30461b..f7a88b98c0b48 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/DDLParser.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/DDLParser.scala @@ -80,9 +80,9 @@ class DDLParser(parseQuery: String => LogicalPlan) */ protected lazy val createTable: Parser[LogicalPlan] = { // TODO: Support database.table. - (CREATE ~> TEMPORARY.? <~ TABLE) ~ (IF ~> NOT <~ EXISTS).? ~ ident ~ + (CREATE ~> TEMPORARY.? <~ TABLE) ~ (IF ~> NOT <~ EXISTS).? ~ tableIdentifier ~ tableCols.? ~ (USING ~> className) ~ (OPTIONS ~> options).? ~ (AS ~> restInput).? ^^ { - case temp ~ allowExisting ~ tableName ~ columns ~ provider ~ opts ~ query => + case temp ~ allowExisting ~ tableIdent ~ columns ~ provider ~ opts ~ query => if (temp.isDefined && allowExisting.isDefined) { throw new DDLException( "a CREATE TEMPORARY TABLE statement does not allow IF NOT EXISTS clause.") @@ -104,7 +104,7 @@ class DDLParser(parseQuery: String => LogicalPlan) } val queryPlan = parseQuery(query.get) - CreateTableUsingAsSelect(tableName, + CreateTableUsingAsSelect(tableIdent, provider, temp.isDefined, Array.empty[String], @@ -114,7 +114,7 @@ class DDLParser(parseQuery: String => LogicalPlan) } else { val userSpecifiedSchema = columns.flatMap(fields => Some(StructType(fields))) CreateTableUsing( - tableName, + tableIdent, userSpecifiedSchema, provider, temp.isDefined, @@ -125,6 +125,12 @@ class DDLParser(parseQuery: String => LogicalPlan) } } + // This is the same as tableIdentifier in SqlParser. 
+ protected lazy val tableIdentifier: Parser[TableIdentifier] = + (ident <~ ".").? ~ ident ^^ { + case maybeDbName ~ tableName => TableIdentifier(tableName, maybeDbName) + } + protected lazy val tableCols: Parser[Seq[StructField]] = "(" ~> repsep(column, ",") <~ ")" /* @@ -132,21 +138,15 @@ class DDLParser(parseQuery: String => LogicalPlan) * This will display all columns of table `avroTable` includes column_name,column_type,comment */ protected lazy val describeTable: Parser[LogicalPlan] = - (DESCRIBE ~> opt(EXTENDED)) ~ (ident <~ ".").? ~ ident ^^ { - case e ~ db ~ tbl => - val tblIdentifier = db match { - case Some(dbName) => - Seq(dbName, tbl) - case None => - Seq(tbl) - } - DescribeCommand(UnresolvedRelation(tblIdentifier, None), e.isDefined) + (DESCRIBE ~> opt(EXTENDED)) ~ tableIdentifier ^^ { + case e ~ tableIdent => + DescribeCommand(UnresolvedRelation(tableIdent.toSeq, None), e.isDefined) } protected lazy val refreshTable: Parser[LogicalPlan] = - REFRESH ~> TABLE ~> (ident <~ ".").? ~ ident ^^ { - case maybeDatabaseName ~ tableName => - RefreshTable(TableIdentifier(tableName, maybeDatabaseName)) + REFRESH ~> TABLE ~> tableIdentifier ^^ { + case tableIndet => + RefreshTable(tableIndet) } protected lazy val options: Parser[Map[String, String]] = diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/ddl.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/ddl.scala index ecd304c30cdee..31d6b75e13477 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/ddl.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/ddl.scala @@ -53,7 +53,7 @@ case class DescribeCommand( * If it is false, an exception will be thrown */ case class CreateTableUsing( - tableName: String, + tableIdent: TableIdentifier, userSpecifiedSchema: Option[StructType], provider: String, temporary: Boolean, @@ -71,8 +71,9 @@ case class CreateTableUsing( * can analyze the logical plan that will be used to populate the table. * So, [[PreWriteCheck]] can detect cases that are not allowed. */ +// TODO: Use TableIdentifier instead of String for tableName (SPARK-10104). case class CreateTableUsingAsSelect( - tableName: String, + tableIdent: TableIdentifier, provider: String, temporary: Boolean, partitionColumns: Array[String], @@ -80,12 +81,10 @@ case class CreateTableUsingAsSelect( options: Map[String, String], child: LogicalPlan) extends UnaryNode { override def output: Seq[Attribute] = Seq.empty[Attribute] - // TODO: Override resolved after we support databaseName. 
- // override lazy val resolved = databaseName != None && childrenResolved } case class CreateTempTableUsing( - tableName: String, + tableIdent: TableIdentifier, userSpecifiedSchema: Option[StructType], provider: String, options: Map[String, String]) extends RunnableCommand { @@ -93,14 +92,16 @@ case class CreateTempTableUsing( def run(sqlContext: SQLContext): Seq[Row] = { val resolved = ResolvedDataSource( sqlContext, userSpecifiedSchema, Array.empty[String], provider, options) - sqlContext.registerDataFrameAsTable( - DataFrame(sqlContext, LogicalRelation(resolved.relation)), tableName) + sqlContext.catalog.registerTable( + tableIdent.toSeq, + DataFrame(sqlContext, LogicalRelation(resolved.relation)).logicalPlan) + Seq.empty[Row] } } case class CreateTempTableUsingAsSelect( - tableName: String, + tableIdent: TableIdentifier, provider: String, partitionColumns: Array[String], mode: SaveMode, @@ -110,8 +111,9 @@ case class CreateTempTableUsingAsSelect( override def run(sqlContext: SQLContext): Seq[Row] = { val df = DataFrame(sqlContext, query) val resolved = ResolvedDataSource(sqlContext, provider, partitionColumns, mode, options, df) - sqlContext.registerDataFrameAsTable( - DataFrame(sqlContext, LogicalRelation(resolved.relation)), tableName) + sqlContext.catalog.registerTable( + tableIdent.toSeq, + DataFrame(sqlContext, LogicalRelation(resolved.relation)).logicalPlan) Seq.empty[Row] } diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/rules.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/rules.scala index 9d3d35692ffcc..16c9138419fa2 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/rules.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/rules.scala @@ -140,12 +140,12 @@ private[sql] case class PreWriteCheck(catalog: Catalog) extends (LogicalPlan => // OK } - case CreateTableUsingAsSelect(tableName, _, _, partitionColumns, mode, _, query) => + case CreateTableUsingAsSelect(tableIdent, _, _, partitionColumns, mode, _, query) => // When the SaveMode is Overwrite, we need to check if the table is an input table of // the query. If so, we will throw an AnalysisException to let users know it is not allowed. - if (mode == SaveMode.Overwrite && catalog.tableExists(Seq(tableName))) { + if (mode == SaveMode.Overwrite && catalog.tableExists(tableIdent.toSeq)) { // Need to remove SubQuery operator. - EliminateSubQueries(catalog.lookupRelation(Seq(tableName))) match { + EliminateSubQueries(catalog.lookupRelation(tableIdent.toSeq)) match { // Only do the check if the table is a data source table // (the relation is a BaseRelation). 
case l @ LogicalRelation(dest: BaseRelation) => @@ -155,7 +155,7 @@ private[sql] case class PreWriteCheck(catalog: Catalog) extends (LogicalPlan => } if (srcRelations.contains(dest)) { failAnalysis( - s"Cannot overwrite table $tableName that is also being read from.") + s"Cannot overwrite table $tableIdent that is also being read from.") } else { // OK } diff --git a/sql/core/src/test/scala/org/apache/spark/sql/SQLQuerySuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/SQLQuerySuite.scala index 141468ca00d67..da50aec17c89e 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/SQLQuerySuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/SQLQuerySuite.scala @@ -1644,4 +1644,39 @@ class SQLQuerySuite extends QueryTest with SharedSQLContext { checkAnswer(sql("select count(num) from 1one"), Row(10)) } } + + test("specifying database name for a temporary table is not allowed") { + withTempPath { dir => + val path = dir.getCanonicalPath + val df = + sqlContext.sparkContext.parallelize(1 to 10).map(i => (i, i.toString)).toDF("num", "str") + df + .write + .format("parquet") + .save(path) + + val message = intercept[AnalysisException] { + sqlContext.sql( + s""" + |CREATE TEMPORARY TABLE db.t + |USING parquet + |OPTIONS ( + | path '$path' + |) + """.stripMargin) + }.getMessage + assert(message.contains("Specifying database name or other qualifiers are not allowed")) + + // If you use backticks to quote the name of a temporary table having dot in it. + sqlContext.sql( + s""" + |CREATE TEMPORARY TABLE `db.t` + |USING parquet + |OPTIONS ( + | path '$path' + |) + """.stripMargin) + checkAnswer(sqlContext.table("`db.t`"), df) + } + } } diff --git a/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveContext.scala b/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveContext.scala index 17762649fd70d..17cc83087fb1d 100644 --- a/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveContext.scala +++ b/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveContext.scala @@ -43,7 +43,7 @@ import org.apache.spark.annotation.Experimental import org.apache.spark.sql._ import org.apache.spark.sql.SQLConf.SQLConfEntry import org.apache.spark.sql.SQLConf.SQLConfEntry._ -import org.apache.spark.sql.catalyst.{TableIdentifier, ParserDialect} +import org.apache.spark.sql.catalyst.{SqlParser, TableIdentifier, ParserDialect} import org.apache.spark.sql.catalyst.analysis._ import org.apache.spark.sql.catalyst.plans.logical._ import org.apache.spark.sql.execution.{ExecutedCommand, ExtractPythonUDFs, SetCommand} @@ -189,6 +189,10 @@ class HiveContext(sc: SparkContext) extends SQLContext(sc) with Logging { // We instantiate a HiveConf here to read in the hive-site.xml file and then pass the options // into the isolated client loader val metadataConf = new HiveConf() + + val defaltWarehouseLocation = metadataConf.get("hive.metastore.warehouse.dir") + logInfo("defalt warehouse location is " + defaltWarehouseLocation) + // `configure` goes second to override other settings. 
val allConfig = metadataConf.iterator.map(e => e.getKey -> e.getValue).toMap ++ configure @@ -288,12 +292,13 @@ class HiveContext(sc: SparkContext) extends SQLContext(sc) with Logging { * @since 1.3.0 */ def refreshTable(tableName: String): Unit = { - val tableIdent = TableIdentifier(tableName).withDatabase(catalog.client.currentDatabase) + val tableIdent = new SqlParser().parseTableIdentifier(tableName) catalog.refreshTable(tableIdent) } protected[hive] def invalidateTable(tableName: String): Unit = { - catalog.invalidateTable(catalog.client.currentDatabase, tableName) + val tableIdent = new SqlParser().parseTableIdentifier(tableName) + catalog.invalidateTable(tableIdent) } /** @@ -307,7 +312,8 @@ class HiveContext(sc: SparkContext) extends SQLContext(sc) with Logging { */ @Experimental def analyze(tableName: String) { - val relation = EliminateSubQueries(catalog.lookupRelation(Seq(tableName))) + val tableIdent = new SqlParser().parseTableIdentifier(tableName) + val relation = EliminateSubQueries(catalog.lookupRelation(tableIdent.toSeq)) relation match { case relation: MetastoreRelation => diff --git a/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveMetastoreCatalog.scala b/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveMetastoreCatalog.scala index 6770462bb0ad3..bbe8c1911bf86 100644 --- a/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveMetastoreCatalog.scala +++ b/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveMetastoreCatalog.scala @@ -174,10 +174,13 @@ private[hive] class HiveMetastoreCatalog(val client: ClientInterface, hive: Hive // it is better at here to invalidate the cache to avoid confusing waring logs from the // cache loader (e.g. cannot find data source provider, which is only defined for // data source table.). - invalidateTable(tableIdent.database.getOrElse(client.currentDatabase), tableIdent.table) + invalidateTable(tableIdent) } - def invalidateTable(databaseName: String, tableName: String): Unit = { + def invalidateTable(tableIdent: TableIdentifier): Unit = { + val databaseName = tableIdent.database.getOrElse(client.currentDatabase) + val tableName = tableIdent.table + cachedDataSourceTables.invalidate(QualifiedTableName(databaseName, tableName).toLowerCase) } @@ -187,6 +190,7 @@ private[hive] class HiveMetastoreCatalog(val client: ClientInterface, hive: Hive * Creates a data source table (a table created with USING clause) in Hive's metastore. * Returns true when the table has been created. Otherwise, false. */ + // TODO: Remove this in SPARK-10104. 
def createDataSourceTable( tableName: String, userSpecifiedSchema: Option[StructType], @@ -203,7 +207,7 @@ private[hive] class HiveMetastoreCatalog(val client: ClientInterface, hive: Hive isExternal) } - private def createDataSourceTable( + def createDataSourceTable( tableIdent: TableIdentifier, userSpecifiedSchema: Option[StructType], partitionColumns: Array[String], @@ -371,10 +375,16 @@ private[hive] class HiveMetastoreCatalog(val client: ClientInterface, hive: Hive } def hiveDefaultTableFilePath(tableName: String): String = { + hiveDefaultTableFilePath(new SqlParser().parseTableIdentifier(tableName)) + } + + def hiveDefaultTableFilePath(tableIdent: TableIdentifier): String = { // Code based on: hiveWarehouse.getTablePath(currentDatabase, tableName) + val database = tableIdent.database.getOrElse(client.currentDatabase) + new Path( - new Path(client.getDatabase(client.currentDatabase).location), - tableName.toLowerCase).toString + new Path(client.getDatabase(database).location), + tableIdent.table.toLowerCase).toString } def tableExists(tableIdentifier: Seq[String]): Boolean = { @@ -635,7 +645,7 @@ private[hive] class HiveMetastoreCatalog(val client: ClientInterface, hive: Hive val mode = if (allowExisting) SaveMode.Ignore else SaveMode.ErrorIfExists CreateTableUsingAsSelect( - desc.name, + TableIdentifier(desc.name), hive.conf.defaultDataSourceName, temporary = false, Array.empty[String], diff --git a/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveStrategies.scala b/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveStrategies.scala index cd6cd322c94ed..d38ad9127327d 100644 --- a/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveStrategies.scala +++ b/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveStrategies.scala @@ -83,14 +83,16 @@ private[hive] trait HiveStrategies { object HiveDDLStrategy extends Strategy { def apply(plan: LogicalPlan): Seq[SparkPlan] = plan match { case CreateTableUsing( - tableName, userSpecifiedSchema, provider, false, opts, allowExisting, managedIfNoPath) => - ExecutedCommand( + tableIdent, userSpecifiedSchema, provider, false, opts, allowExisting, managedIfNoPath) => + val cmd = CreateMetastoreDataSource( - tableName, userSpecifiedSchema, provider, opts, allowExisting, managedIfNoPath)) :: Nil + tableIdent, userSpecifiedSchema, provider, opts, allowExisting, managedIfNoPath) + ExecutedCommand(cmd) :: Nil - case CreateTableUsingAsSelect(tableName, provider, false, partitionCols, mode, opts, query) => + case CreateTableUsingAsSelect( + tableIdent, provider, false, partitionCols, mode, opts, query) => val cmd = - CreateMetastoreDataSourceAsSelect(tableName, provider, partitionCols, mode, opts, query) + CreateMetastoreDataSourceAsSelect(tableIdent, provider, partitionCols, mode, opts, query) ExecutedCommand(cmd) :: Nil case _ => Nil diff --git a/sql/hive/src/main/scala/org/apache/spark/sql/hive/execution/commands.scala b/sql/hive/src/main/scala/org/apache/spark/sql/hive/execution/commands.scala index 05a78930afe3d..d1699dd536817 100644 --- a/sql/hive/src/main/scala/org/apache/spark/sql/hive/execution/commands.scala +++ b/sql/hive/src/main/scala/org/apache/spark/sql/hive/execution/commands.scala @@ -17,7 +17,9 @@ package org.apache.spark.sql.hive.execution +import org.apache.hadoop.hive.metastore.MetaStoreUtils import org.apache.spark.sql._ +import org.apache.spark.sql.catalyst.{TableIdentifier, SqlParser} import org.apache.spark.sql.catalyst.analysis.EliminateSubQueries import org.apache.spark.sql.catalyst.expressions.Attribute import 
org.apache.spark.sql.catalyst.plans.logical.LogicalPlan @@ -120,9 +122,10 @@ case class AddFile(path: String) extends RunnableCommand { } } +// TODO: Use TableIdentifier instead of String for tableName (SPARK-10104). private[hive] case class CreateMetastoreDataSource( - tableName: String, + tableIdent: TableIdentifier, userSpecifiedSchema: Option[StructType], provider: String, options: Map[String, String], @@ -130,9 +133,24 @@ case class CreateMetastoreDataSource( managedIfNoPath: Boolean) extends RunnableCommand { override def run(sqlContext: SQLContext): Seq[Row] = { + // Since we are saving metadata to metastore, we need to check if metastore supports + // the table name and database name we have for this query. MetaStoreUtils.validateName + // is the method used by Hive to check if a table name or a database name is valid for + // the metastore. + if (!MetaStoreUtils.validateName(tableIdent.table)) { + throw new AnalysisException(s"Table name ${tableIdent.table} is not a valid name for " + + s"metastore. Metastore only accepts table name containing characters, numbers and _.") + } + if (tableIdent.database.isDefined && !MetaStoreUtils.validateName(tableIdent.database.get)) { + throw new AnalysisException(s"Database name ${tableIdent.database.get} is not a valid name " + + s"for metastore. Metastore only accepts database name containing " + + s"characters, numbers and _.") + } + + val tableName = tableIdent.unquotedString val hiveContext = sqlContext.asInstanceOf[HiveContext] - if (hiveContext.catalog.tableExists(tableName :: Nil)) { + if (hiveContext.catalog.tableExists(tableIdent.toSeq)) { if (allowExisting) { return Seq.empty[Row] } else { @@ -144,13 +162,13 @@ case class CreateMetastoreDataSource( val optionsWithPath = if (!options.contains("path") && managedIfNoPath) { isExternal = false - options + ("path" -> hiveContext.catalog.hiveDefaultTableFilePath(tableName)) + options + ("path" -> hiveContext.catalog.hiveDefaultTableFilePath(tableIdent)) } else { options } hiveContext.catalog.createDataSourceTable( - tableName, + tableIdent, userSpecifiedSchema, Array.empty[String], provider, @@ -161,9 +179,10 @@ case class CreateMetastoreDataSource( } } +// TODO: Use TableIdentifier instead of String for tableName (SPARK-10104). private[hive] case class CreateMetastoreDataSourceAsSelect( - tableName: String, + tableIdent: TableIdentifier, provider: String, partitionColumns: Array[String], mode: SaveMode, @@ -171,19 +190,34 @@ case class CreateMetastoreDataSourceAsSelect( query: LogicalPlan) extends RunnableCommand { override def run(sqlContext: SQLContext): Seq[Row] = { + // Since we are saving metadata to metastore, we need to check if metastore supports + // the table name and database name we have for this query. MetaStoreUtils.validateName + // is the method used by Hive to check if a table name or a database name is valid for + // the metastore. + if (!MetaStoreUtils.validateName(tableIdent.table)) { + throw new AnalysisException(s"Table name ${tableIdent.table} is not a valid name for " + + s"metastore. Metastore only accepts table name containing characters, numbers and _.") + } + if (tableIdent.database.isDefined && !MetaStoreUtils.validateName(tableIdent.database.get)) { + throw new AnalysisException(s"Database name ${tableIdent.database.get} is not a valid name " + + s"for metastore. 
Metastore only accepts database name containing " + + s"characters, numbers and _.") + } + + val tableName = tableIdent.unquotedString val hiveContext = sqlContext.asInstanceOf[HiveContext] var createMetastoreTable = false var isExternal = true val optionsWithPath = if (!options.contains("path")) { isExternal = false - options + ("path" -> hiveContext.catalog.hiveDefaultTableFilePath(tableName)) + options + ("path" -> hiveContext.catalog.hiveDefaultTableFilePath(tableIdent)) } else { options } var existingSchema = None: Option[StructType] - if (sqlContext.catalog.tableExists(Seq(tableName))) { + if (sqlContext.catalog.tableExists(tableIdent.toSeq)) { // Check if we need to throw an exception or just return. mode match { case SaveMode.ErrorIfExists => @@ -200,7 +234,7 @@ case class CreateMetastoreDataSourceAsSelect( val resolved = ResolvedDataSource( sqlContext, Some(query.schema.asNullable), partitionColumns, provider, optionsWithPath) val createdRelation = LogicalRelation(resolved.relation) - EliminateSubQueries(sqlContext.table(tableName).logicalPlan) match { + EliminateSubQueries(sqlContext.catalog.lookupRelation(tableIdent.toSeq)) match { case l @ LogicalRelation(_: InsertableRelation | _: HadoopFsRelation) => if (l.relation != createdRelation.relation) { val errorDescription = @@ -249,7 +283,7 @@ case class CreateMetastoreDataSourceAsSelect( // the schema of df). It is important since the nullability may be changed by the relation // provider (for example, see org.apache.spark.sql.parquet.DefaultSource). hiveContext.catalog.createDataSourceTable( - tableName, + tableIdent, Some(resolved.relation.schema), partitionColumns, provider, @@ -258,7 +292,7 @@ case class CreateMetastoreDataSourceAsSelect( } // Refresh the cache of the table in the catalog. - hiveContext.refreshTable(tableName) + hiveContext.catalog.refreshTable(tableIdent) Seq.empty[Row] } } diff --git a/sql/hive/src/test/scala/org/apache/spark/sql/hive/ListTablesSuite.scala b/sql/hive/src/test/scala/org/apache/spark/sql/hive/ListTablesSuite.scala index 1c15997ea8e6d..d3388a9429e41 100644 --- a/sql/hive/src/test/scala/org/apache/spark/sql/hive/ListTablesSuite.scala +++ b/sql/hive/src/test/scala/org/apache/spark/sql/hive/ListTablesSuite.scala @@ -34,7 +34,6 @@ class ListTablesSuite extends QueryTest with BeforeAndAfterAll { override def beforeAll(): Unit = { // The catalog in HiveContext is a case insensitive one. 
catalog.registerTable(Seq("ListTablesSuiteTable"), df.logicalPlan) - catalog.registerTable(Seq("ListTablesSuiteDB", "InDBListTablesSuiteTable"), df.logicalPlan) sql("CREATE TABLE HiveListTablesSuiteTable (key int, value string)") sql("CREATE DATABASE IF NOT EXISTS ListTablesSuiteDB") sql("CREATE TABLE ListTablesSuiteDB.HiveInDBListTablesSuiteTable (key int, value string)") @@ -42,7 +41,6 @@ class ListTablesSuite extends QueryTest with BeforeAndAfterAll { override def afterAll(): Unit = { catalog.unregisterTable(Seq("ListTablesSuiteTable")) - catalog.unregisterTable(Seq("ListTablesSuiteDB", "InDBListTablesSuiteTable")) sql("DROP TABLE IF EXISTS HiveListTablesSuiteTable") sql("DROP TABLE IF EXISTS ListTablesSuiteDB.HiveInDBListTablesSuiteTable") sql("DROP DATABASE IF EXISTS ListTablesSuiteDB") @@ -55,7 +53,6 @@ class ListTablesSuite extends QueryTest with BeforeAndAfterAll { checkAnswer( allTables.filter("tableName = 'listtablessuitetable'"), Row("listtablessuitetable", true)) - assert(allTables.filter("tableName = 'indblisttablessuitetable'").count() === 0) checkAnswer( allTables.filter("tableName = 'hivelisttablessuitetable'"), Row("hivelisttablessuitetable", false)) @@ -69,9 +66,6 @@ class ListTablesSuite extends QueryTest with BeforeAndAfterAll { checkAnswer( allTables.filter("tableName = 'listtablessuitetable'"), Row("listtablessuitetable", true)) - checkAnswer( - allTables.filter("tableName = 'indblisttablessuitetable'"), - Row("indblisttablessuitetable", true)) assert(allTables.filter("tableName = 'hivelisttablessuitetable'").count() === 0) checkAnswer( allTables.filter("tableName = 'hiveindblisttablessuitetable'"), diff --git a/sql/hive/src/test/scala/org/apache/spark/sql/hive/MultiDatabaseSuite.scala b/sql/hive/src/test/scala/org/apache/spark/sql/hive/MultiDatabaseSuite.scala index 417e8b07917cc..997c667ec0d1b 100644 --- a/sql/hive/src/test/scala/org/apache/spark/sql/hive/MultiDatabaseSuite.scala +++ b/sql/hive/src/test/scala/org/apache/spark/sql/hive/MultiDatabaseSuite.scala @@ -19,14 +19,22 @@ package org.apache.spark.sql.hive import org.apache.spark.sql.hive.test.TestHive import org.apache.spark.sql.test.SQLTestUtils -import org.apache.spark.sql.{QueryTest, SQLContext, SaveMode} +import org.apache.spark.sql.{AnalysisException, QueryTest, SQLContext, SaveMode} class MultiDatabaseSuite extends QueryTest with SQLTestUtils { - override val _sqlContext: SQLContext = TestHive + override val _sqlContext: HiveContext = TestHive private val sqlContext = _sqlContext private val df = sqlContext.range(10).coalesce(1) + private def checkTablePath(dbName: String, tableName: String): Unit = { + // val hiveContext = sqlContext.asInstanceOf[HiveContext] + val metastoreTable = sqlContext.catalog.client.getTable(dbName, tableName) + val expectedPath = sqlContext.catalog.client.getDatabase(dbName).location + "/" + tableName + + assert(metastoreTable.serdeProperties("path") === expectedPath) + } + test(s"saveAsTable() to non-default database - with USE - Overwrite") { withTempDatabase { db => activateDatabase(db) { @@ -37,6 +45,8 @@ class MultiDatabaseSuite extends QueryTest with SQLTestUtils { assert(sqlContext.tableNames(db).contains("t")) checkAnswer(sqlContext.table(s"$db.t"), df) + + checkTablePath(db, "t") } } @@ -45,6 +55,58 @@ class MultiDatabaseSuite extends QueryTest with SQLTestUtils { df.write.mode(SaveMode.Overwrite).saveAsTable(s"$db.t") assert(sqlContext.tableNames(db).contains("t")) checkAnswer(sqlContext.table(s"$db.t"), df) + + checkTablePath(db, "t") + } + } + + 
test(s"createExternalTable() to non-default database - with USE") { + withTempDatabase { db => + activateDatabase(db) { + withTempPath { dir => + val path = dir.getCanonicalPath + df.write.format("parquet").mode(SaveMode.Overwrite).save(path) + + sqlContext.createExternalTable("t", path, "parquet") + assert(sqlContext.tableNames(db).contains("t")) + checkAnswer(sqlContext.table("t"), df) + + sql( + s""" + |CREATE TABLE t1 + |USING parquet + |OPTIONS ( + | path '$path' + |) + """.stripMargin) + assert(sqlContext.tableNames(db).contains("t1")) + checkAnswer(sqlContext.table("t1"), df) + } + } + } + } + + test(s"createExternalTable() to non-default database - without USE") { + withTempDatabase { db => + withTempPath { dir => + val path = dir.getCanonicalPath + df.write.format("parquet").mode(SaveMode.Overwrite).save(path) + sqlContext.createExternalTable(s"$db.t", path, "parquet") + + assert(sqlContext.tableNames(db).contains("t")) + checkAnswer(sqlContext.table(s"$db.t"), df) + + sql( + s""" + |CREATE TABLE $db.t1 + |USING parquet + |OPTIONS ( + | path '$path' + |) + """.stripMargin) + assert(sqlContext.tableNames(db).contains("t1")) + checkAnswer(sqlContext.table(s"$db.t1"), df) + } } } @@ -59,6 +121,8 @@ class MultiDatabaseSuite extends QueryTest with SQLTestUtils { assert(sqlContext.tableNames(db).contains("t")) checkAnswer(sqlContext.table(s"$db.t"), df.unionAll(df)) + + checkTablePath(db, "t") } } @@ -68,6 +132,8 @@ class MultiDatabaseSuite extends QueryTest with SQLTestUtils { df.write.mode(SaveMode.Append).saveAsTable(s"$db.t") assert(sqlContext.tableNames(db).contains("t")) checkAnswer(sqlContext.table(s"$db.t"), df.unionAll(df)) + + checkTablePath(db, "t") } } @@ -130,7 +196,7 @@ class MultiDatabaseSuite extends QueryTest with SQLTestUtils { } } - test("Refreshes a table in a non-default database") { + test("Refreshes a table in a non-default database - with USE") { import org.apache.spark.sql.functions.lit withTempDatabase { db => @@ -151,8 +217,94 @@ class MultiDatabaseSuite extends QueryTest with SQLTestUtils { sql("ALTER TABLE t ADD PARTITION (p=1)") sql("REFRESH TABLE t") checkAnswer(sqlContext.table("t"), df.withColumn("p", lit(1))) + + df.write.parquet(s"$path/p=2") + sql("ALTER TABLE t ADD PARTITION (p=2)") + sqlContext.refreshTable("t") + checkAnswer( + sqlContext.table("t"), + df.withColumn("p", lit(1)).unionAll(df.withColumn("p", lit(2)))) } } } } + + test("Refreshes a table in a non-default database - without USE") { + import org.apache.spark.sql.functions.lit + + withTempDatabase { db => + withTempPath { dir => + val path = dir.getCanonicalPath + + sql( + s"""CREATE EXTERNAL TABLE $db.t (id BIGINT) + |PARTITIONED BY (p INT) + |STORED AS PARQUET + |LOCATION '$path' + """.stripMargin) + + checkAnswer(sqlContext.table(s"$db.t"), sqlContext.emptyDataFrame) + + df.write.parquet(s"$path/p=1") + sql(s"ALTER TABLE $db.t ADD PARTITION (p=1)") + sql(s"REFRESH TABLE $db.t") + checkAnswer(sqlContext.table(s"$db.t"), df.withColumn("p", lit(1))) + + df.write.parquet(s"$path/p=2") + sql(s"ALTER TABLE $db.t ADD PARTITION (p=2)") + sqlContext.refreshTable(s"$db.t") + checkAnswer( + sqlContext.table(s"$db.t"), + df.withColumn("p", lit(1)).unionAll(df.withColumn("p", lit(2)))) + } + } + } + + test("invalid database name and table names") { + { + val message = intercept[AnalysisException] { + df.write.format("parquet").saveAsTable("`d:b`.`t:a`") + }.getMessage + assert(message.contains("is not a valid name for metastore")) + } + + { + val message = intercept[AnalysisException] { + 
df.write.format("parquet").saveAsTable("`d:b`.`table`") + }.getMessage + assert(message.contains("is not a valid name for metastore")) + } + + withTempPath { dir => + val path = dir.getCanonicalPath + + { + val message = intercept[AnalysisException] { + sql( + s""" + |CREATE TABLE `d:b`.`t:a` (a int) + |USING parquet + |OPTIONS ( + | path '$path' + |) + """.stripMargin) + }.getMessage + assert(message.contains("is not a valid name for metastore")) + } + + { + val message = intercept[AnalysisException] { + sql( + s""" + |CREATE TABLE `d:b`.`table` (a int) + |USING parquet + |OPTIONS ( + | path '$path' + |) + """.stripMargin) + }.getMessage + assert(message.contains("is not a valid name for metastore")) + } + } + } } diff --git a/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/SQLQuerySuite.scala b/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/SQLQuerySuite.scala index 8b8f520776e70..55ecbd5b5f21d 100644 --- a/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/SQLQuerySuite.scala +++ b/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/SQLQuerySuite.scala @@ -1138,4 +1138,39 @@ class SQLQuerySuite extends QueryTest with SQLTestUtils { Row(CalendarInterval.fromString( "interval 4 minutes 59 seconds 889 milliseconds 987 microseconds"))) } + + test("specifying database name for a temporary table is not allowed") { + withTempPath { dir => + val path = dir.getCanonicalPath + val df = + sqlContext.sparkContext.parallelize(1 to 10).map(i => (i, i.toString)).toDF("num", "str") + df + .write + .format("parquet") + .save(path) + + val message = intercept[AnalysisException] { + sqlContext.sql( + s""" + |CREATE TEMPORARY TABLE db.t + |USING parquet + |OPTIONS ( + | path '$path' + |) + """.stripMargin) + }.getMessage + assert(message.contains("Specifying database name or other qualifiers are not allowed")) + + // If you use backticks to quote the name of a temporary table having dot in it. + sqlContext.sql( + s""" + |CREATE TEMPORARY TABLE `db.t` + |USING parquet + |OPTIONS ( + | path '$path' + |) + """.stripMargin) + checkAnswer(sqlContext.table("`db.t`"), df) + } + } } From b4f4e91c395cb69ced61d9ff1492d1b814f96828 Mon Sep 17 00:00:00 2001 From: Reynold Xin Date: Thu, 20 Aug 2015 07:53:27 -0700 Subject: [PATCH 019/802] [SPARK-10100] [SQL] Eliminate hash table lookup if there is no grouping key in aggregation. This improves performance by ~ 20 - 30% in one of my local test and should fix the performance regression from 1.4 to 1.5 on ss_max. Author: Reynold Xin Closes #8332 from rxin/SPARK-10100. --- .../aggregate/TungstenAggregate.scala | 2 +- .../TungstenAggregationIterator.scala | 30 +++++++++++++------ 2 files changed, 22 insertions(+), 10 deletions(-) diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/aggregate/TungstenAggregate.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/aggregate/TungstenAggregate.scala index 99f51ba5b6935..ba379d358d206 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/aggregate/TungstenAggregate.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/aggregate/TungstenAggregate.scala @@ -104,7 +104,7 @@ case class TungstenAggregate( } else { // This is a grouped aggregate and the input iterator is empty, // so return an empty iterator. 
- Iterator[UnsafeRow]() + Iterator.empty } } else { aggregationIterator.start(parentIterator) diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/aggregate/TungstenAggregationIterator.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/aggregate/TungstenAggregationIterator.scala index af7e0fcedbe4e..26fdbc83ef50b 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/aggregate/TungstenAggregationIterator.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/aggregate/TungstenAggregationIterator.scala @@ -357,18 +357,30 @@ class TungstenAggregationIterator( // sort-based aggregation (by calling switchToSortBasedAggregation). private def processInputs(): Unit = { assert(inputIter != null, "attempted to process input when iterator was null") - while (!sortBased && inputIter.hasNext) { - val newInput = inputIter.next() - numInputRows += 1 - val groupingKey = groupProjection.apply(newInput) + if (groupingExpressions.isEmpty) { + // If there is no grouping expressions, we can just reuse the same buffer over and over again. + // Note that it would be better to eliminate the hash map entirely in the future. + val groupingKey = groupProjection.apply(null) val buffer: UnsafeRow = hashMap.getAggregationBufferFromUnsafeRow(groupingKey) - if (buffer == null) { - // buffer == null means that we could not allocate more memory. - // Now, we need to spill the map and switch to sort-based aggregation. - switchToSortBasedAggregation(groupingKey, newInput) - } else { + while (inputIter.hasNext) { + val newInput = inputIter.next() + numInputRows += 1 processRow(buffer, newInput) } + } else { + while (!sortBased && inputIter.hasNext) { + val newInput = inputIter.next() + numInputRows += 1 + val groupingKey = groupProjection.apply(newInput) + val buffer: UnsafeRow = hashMap.getAggregationBufferFromUnsafeRow(groupingKey) + if (buffer == null) { + // buffer == null means that we could not allocate more memory. + // Now, we need to spill the map and switch to sort-based aggregation. + switchToSortBasedAggregation(groupingKey, newInput) + } else { + processRow(buffer, newInput) + } + } } } From 52c60537a274af5414f6b0340a4bd7488ef35280 Mon Sep 17 00:00:00 2001 From: MechCoder Date: Thu, 20 Aug 2015 10:05:31 -0700 Subject: [PATCH 020/802] [MINOR] [SQL] Fix sphinx warnings in PySpark SQL Author: MechCoder Closes #8171 from MechCoder/sql_sphinx. --- python/pyspark/context.py | 8 ++++---- python/pyspark/sql/types.py | 4 +++- 2 files changed, 7 insertions(+), 5 deletions(-) diff --git a/python/pyspark/context.py b/python/pyspark/context.py index eb5b0bbbdac4b..1b2a52ad64114 100644 --- a/python/pyspark/context.py +++ b/python/pyspark/context.py @@ -302,10 +302,10 @@ def applicationId(self): """ A unique identifier for the Spark application. Its format depends on the scheduler implementation. - (i.e. - in case of local spark app something like 'local-1433865536131' - in case of YARN something like 'application_1433865536131_34483' - ) + + * in case of local spark app something like 'local-1433865536131' + * in case of YARN something like 'application_1433865536131_34483' + >>> sc.applicationId # doctest: +ELLIPSIS u'local-...' """ diff --git a/python/pyspark/sql/types.py b/python/pyspark/sql/types.py index c083bf89905bf..ed4e5b594bd61 100644 --- a/python/pyspark/sql/types.py +++ b/python/pyspark/sql/types.py @@ -467,9 +467,11 @@ def add(self, field, data_type=None, nullable=True, metadata=None): """ Construct a StructType by adding new elements to it to define the schema. 
The method accepts either: + a) A single parameter which is a StructField object. b) Between 2 and 4 parameters as (name, data_type, nullable (optional), - metadata(optional). The data_type parameter may be either a String or a DataType object + metadata(optional). The data_type parameter may be either a String or a + DataType object. >>> struct1 = StructType().add("f1", StringType(), True).add("f2", StringType(), True, None) >>> struct2 = StructType([StructField("f1", StringType(), True),\ From 39e91fe2fd43044cc734d55625a3c03284b69f09 Mon Sep 17 00:00:00 2001 From: Alex Shkurenko Date: Thu, 20 Aug 2015 10:16:38 -0700 Subject: [PATCH 021/802] [SPARK-9982] [SPARKR] SparkR DataFrame fail to return data of Decimal type Author: Alex Shkurenko Closes #8239 from ashkurenko/master. --- core/src/main/scala/org/apache/spark/api/r/SerDe.scala | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/core/src/main/scala/org/apache/spark/api/r/SerDe.scala b/core/src/main/scala/org/apache/spark/api/r/SerDe.scala index d5b4260bf4529..3c89f24473744 100644 --- a/core/src/main/scala/org/apache/spark/api/r/SerDe.scala +++ b/core/src/main/scala/org/apache/spark/api/r/SerDe.scala @@ -181,6 +181,7 @@ private[spark] object SerDe { // Boolean -> logical // Float -> double // Double -> double + // Decimal -> double // Long -> double // Array[Byte] -> raw // Date -> Date @@ -219,6 +220,10 @@ private[spark] object SerDe { case "float" | "java.lang.Float" => writeType(dos, "double") writeDouble(dos, value.asInstanceOf[Float].toDouble) + case "decimal" | "java.math.BigDecimal" => + writeType(dos, "double") + val javaDecimal = value.asInstanceOf[java.math.BigDecimal] + writeDouble(dos, scala.math.BigDecimal(javaDecimal).toDouble) case "double" | "java.lang.Double" => writeType(dos, "double") writeDouble(dos, value.asInstanceOf[Double]) From 85f9a61357994da5023b08b0a8a2eb09388ce7f8 Mon Sep 17 00:00:00 2001 From: Cheng Lian Date: Thu, 20 Aug 2015 11:00:24 -0700 Subject: [PATCH 022/802] [SPARK-10136] [SQL] Fixes Parquet support for Avro array of primitive array I caught SPARK-10136 while adding more test cases to `ParquetAvroCompatibilitySuite`. Actual bug fix code lies in `CatalystRowConverter.scala`. Author: Cheng Lian Closes #8341 from liancheng/spark-10136/parquet-avro-nested-primitive-array. 
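For orientation, the hedged sketch below only illustrates the data shape this fix targets, a column typed array<array<int>>. The regression itself concerned files written through parquet-avro and is covered by the new ParquetAvroCompatibilitySuite cases; the output path and the `sqlContext` value here are assumptions.

// Illustration of the nested-array shape, not a reproduction of the parquet-avro bug.
import sqlContext.implicits._

val nested = sqlContext.sparkContext
  .parallelize(Seq(Tuple1(Seq(Seq(1, 2), Seq(3))), Tuple1(Seq(Seq(4, 5, 6)))))
  .toDF("int_arrays_column")

nested.write.mode("overwrite").parquet("/tmp/nested-int-arrays")
sqlContext.read.parquet("/tmp/nested-int-arrays").collect().foreach(println)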
--- .../parquet/CatalystReadSupport.scala | 1 - .../parquet/CatalystRowConverter.scala | 24 +- sql/core/src/test/avro/parquet-compat.avdl | 19 +- sql/core/src/test/avro/parquet-compat.avpr | 54 +- .../parquet/test/avro/AvroArrayOfArray.java | 142 +++ .../parquet/test/avro/AvroMapOfArray.java | 142 +++ .../test/avro/AvroNonNullableArrays.java | 196 +++++ .../test/avro/AvroOptionalPrimitives.java | 466 ++++++++++ .../parquet/test/avro/AvroPrimitives.java | 461 ++++++++++ .../parquet/test/avro/CompatibilityTest.java | 2 +- .../parquet/test/avro/ParquetAvroCompat.java | 821 +----------------- .../ParquetAvroCompatibilitySuite.scala | 227 +++-- .../parquet/ParquetCompatibilityTest.scala | 7 + 13 files changed, 1718 insertions(+), 844 deletions(-) create mode 100644 sql/core/src/test/gen-java/org/apache/spark/sql/execution/datasources/parquet/test/avro/AvroArrayOfArray.java create mode 100644 sql/core/src/test/gen-java/org/apache/spark/sql/execution/datasources/parquet/test/avro/AvroMapOfArray.java create mode 100644 sql/core/src/test/gen-java/org/apache/spark/sql/execution/datasources/parquet/test/avro/AvroNonNullableArrays.java create mode 100644 sql/core/src/test/gen-java/org/apache/spark/sql/execution/datasources/parquet/test/avro/AvroOptionalPrimitives.java create mode 100644 sql/core/src/test/gen-java/org/apache/spark/sql/execution/datasources/parquet/test/avro/AvroPrimitives.java diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/parquet/CatalystReadSupport.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/parquet/CatalystReadSupport.scala index a4679bb2f6389..3f8353af6e2ad 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/parquet/CatalystReadSupport.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/parquet/CatalystReadSupport.scala @@ -61,7 +61,6 @@ private[parquet] class CatalystReadSupport extends ReadSupport[InternalRow] with | |Parquet form: |$parquetRequestedSchema - | |Catalyst form: |$catalystRequestedSchema """.stripMargin diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/parquet/CatalystRowConverter.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/parquet/CatalystRowConverter.scala index 18c5b500209e6..d2c2db51769ba 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/parquet/CatalystRowConverter.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/parquet/CatalystRowConverter.scala @@ -25,11 +25,12 @@ import scala.collection.mutable.ArrayBuffer import org.apache.parquet.column.Dictionary import org.apache.parquet.io.api.{Binary, Converter, GroupConverter, PrimitiveConverter} -import org.apache.parquet.schema.OriginalType.{LIST, INT_32, UTF8} +import org.apache.parquet.schema.OriginalType.{INT_32, LIST, UTF8} import org.apache.parquet.schema.PrimitiveType.PrimitiveTypeName.DOUBLE import org.apache.parquet.schema.Type.Repetition import org.apache.parquet.schema.{GroupType, MessageType, PrimitiveType, Type} +import org.apache.spark.Logging import org.apache.spark.sql.catalyst.InternalRow import org.apache.spark.sql.catalyst.expressions._ import org.apache.spark.sql.catalyst.util.DateTimeUtils @@ -145,7 +146,16 @@ private[parquet] class CatalystRowConverter( parquetType: GroupType, catalystType: StructType, updater: ParentContainerUpdater) - extends CatalystGroupConverter(updater) { + extends CatalystGroupConverter(updater) with Logging { + + logDebug( + 
s"""Building row converter for the following schema: + | + |Parquet form: + |$parquetType + |Catalyst form: + |${catalystType.prettyJson} + """.stripMargin) /** * Updater used together with field converters within a [[CatalystRowConverter]]. It propagates @@ -464,9 +474,15 @@ private[parquet] class CatalystRowConverter( override def getConverter(fieldIndex: Int): Converter = converter - override def end(): Unit = currentArray += currentElement + override def end(): Unit = { + converter.updater.end() + currentArray += currentElement + } - override def start(): Unit = currentElement = null + override def start(): Unit = { + converter.updater.start() + currentElement = null + } } } diff --git a/sql/core/src/test/avro/parquet-compat.avdl b/sql/core/src/test/avro/parquet-compat.avdl index 8070d0a9170a3..c5eb5b5164cf4 100644 --- a/sql/core/src/test/avro/parquet-compat.avdl +++ b/sql/core/src/test/avro/parquet-compat.avdl @@ -34,7 +34,7 @@ protocol CompatibilityTest { string nested_string_column; } - record ParquetAvroCompat { + record AvroPrimitives { boolean bool_column; int int_column; long long_column; @@ -42,7 +42,9 @@ protocol CompatibilityTest { double double_column; bytes binary_column; string string_column; + } + record AvroOptionalPrimitives { union { null, boolean } maybe_bool_column; union { null, int } maybe_int_column; union { null, long } maybe_long_column; @@ -50,7 +52,22 @@ protocol CompatibilityTest { union { null, double } maybe_double_column; union { null, bytes } maybe_binary_column; union { null, string } maybe_string_column; + } + + record AvroNonNullableArrays { + array strings_column; + union { null, array } maybe_ints_column; + } + record AvroArrayOfArray { + array> int_arrays_column; + } + + record AvroMapOfArray { + map> string_to_ints_column; + } + + record ParquetAvroCompat { array strings_column; map string_to_int_column; map> complex_column; diff --git a/sql/core/src/test/avro/parquet-compat.avpr b/sql/core/src/test/avro/parquet-compat.avpr index 060391765034b..9ad315b74fb41 100644 --- a/sql/core/src/test/avro/parquet-compat.avpr +++ b/sql/core/src/test/avro/parquet-compat.avpr @@ -27,7 +27,7 @@ } ] }, { "type" : "record", - "name" : "ParquetAvroCompat", + "name" : "AvroPrimitives", "fields" : [ { "name" : "bool_column", "type" : "boolean" @@ -49,7 +49,11 @@ }, { "name" : "string_column", "type" : "string" - }, { + } ] + }, { + "type" : "record", + "name" : "AvroOptionalPrimitives", + "fields" : [ { "name" : "maybe_bool_column", "type" : [ "null", "boolean" ] }, { @@ -70,7 +74,53 @@ }, { "name" : "maybe_string_column", "type" : [ "null", "string" ] + } ] + }, { + "type" : "record", + "name" : "AvroNonNullableArrays", + "fields" : [ { + "name" : "strings_column", + "type" : { + "type" : "array", + "items" : "string" + } }, { + "name" : "maybe_ints_column", + "type" : [ "null", { + "type" : "array", + "items" : "int" + } ] + } ] + }, { + "type" : "record", + "name" : "AvroArrayOfArray", + "fields" : [ { + "name" : "int_arrays_column", + "type" : { + "type" : "array", + "items" : { + "type" : "array", + "items" : "int" + } + } + } ] + }, { + "type" : "record", + "name" : "AvroMapOfArray", + "fields" : [ { + "name" : "string_to_ints_column", + "type" : { + "type" : "map", + "values" : { + "type" : "array", + "items" : "int" + } + } + } ] + }, { + "type" : "record", + "name" : "ParquetAvroCompat", + "fields" : [ { "name" : "strings_column", "type" : { "type" : "array", diff --git 
a/sql/core/src/test/gen-java/org/apache/spark/sql/execution/datasources/parquet/test/avro/AvroArrayOfArray.java b/sql/core/src/test/gen-java/org/apache/spark/sql/execution/datasources/parquet/test/avro/AvroArrayOfArray.java new file mode 100644 index 0000000000000..ee327827903e5 --- /dev/null +++ b/sql/core/src/test/gen-java/org/apache/spark/sql/execution/datasources/parquet/test/avro/AvroArrayOfArray.java @@ -0,0 +1,142 @@ +/** + * Autogenerated by Avro + * + * DO NOT EDIT DIRECTLY + */ +package org.apache.spark.sql.execution.datasources.parquet.test.avro; +@SuppressWarnings("all") +@org.apache.avro.specific.AvroGenerated +public class AvroArrayOfArray extends org.apache.avro.specific.SpecificRecordBase implements org.apache.avro.specific.SpecificRecord { + public static final org.apache.avro.Schema SCHEMA$ = new org.apache.avro.Schema.Parser().parse("{\"type\":\"record\",\"name\":\"AvroArrayOfArray\",\"namespace\":\"org.apache.spark.sql.execution.datasources.parquet.test.avro\",\"fields\":[{\"name\":\"int_arrays_column\",\"type\":{\"type\":\"array\",\"items\":{\"type\":\"array\",\"items\":\"int\"}}}]}"); + public static org.apache.avro.Schema getClassSchema() { return SCHEMA$; } + @Deprecated public java.util.List> int_arrays_column; + + /** + * Default constructor. Note that this does not initialize fields + * to their default values from the schema. If that is desired then + * one should use newBuilder(). + */ + public AvroArrayOfArray() {} + + /** + * All-args constructor. + */ + public AvroArrayOfArray(java.util.List> int_arrays_column) { + this.int_arrays_column = int_arrays_column; + } + + public org.apache.avro.Schema getSchema() { return SCHEMA$; } + // Used by DatumWriter. Applications should not call. + public java.lang.Object get(int field$) { + switch (field$) { + case 0: return int_arrays_column; + default: throw new org.apache.avro.AvroRuntimeException("Bad index"); + } + } + // Used by DatumReader. Applications should not call. + @SuppressWarnings(value="unchecked") + public void put(int field$, java.lang.Object value$) { + switch (field$) { + case 0: int_arrays_column = (java.util.List>)value$; break; + default: throw new org.apache.avro.AvroRuntimeException("Bad index"); + } + } + + /** + * Gets the value of the 'int_arrays_column' field. + */ + public java.util.List> getIntArraysColumn() { + return int_arrays_column; + } + + /** + * Sets the value of the 'int_arrays_column' field. + * @param value the value to set. 
+ */ + public void setIntArraysColumn(java.util.List> value) { + this.int_arrays_column = value; + } + + /** Creates a new AvroArrayOfArray RecordBuilder */ + public static org.apache.spark.sql.execution.datasources.parquet.test.avro.AvroArrayOfArray.Builder newBuilder() { + return new org.apache.spark.sql.execution.datasources.parquet.test.avro.AvroArrayOfArray.Builder(); + } + + /** Creates a new AvroArrayOfArray RecordBuilder by copying an existing Builder */ + public static org.apache.spark.sql.execution.datasources.parquet.test.avro.AvroArrayOfArray.Builder newBuilder(org.apache.spark.sql.execution.datasources.parquet.test.avro.AvroArrayOfArray.Builder other) { + return new org.apache.spark.sql.execution.datasources.parquet.test.avro.AvroArrayOfArray.Builder(other); + } + + /** Creates a new AvroArrayOfArray RecordBuilder by copying an existing AvroArrayOfArray instance */ + public static org.apache.spark.sql.execution.datasources.parquet.test.avro.AvroArrayOfArray.Builder newBuilder(org.apache.spark.sql.execution.datasources.parquet.test.avro.AvroArrayOfArray other) { + return new org.apache.spark.sql.execution.datasources.parquet.test.avro.AvroArrayOfArray.Builder(other); + } + + /** + * RecordBuilder for AvroArrayOfArray instances. + */ + public static class Builder extends org.apache.avro.specific.SpecificRecordBuilderBase + implements org.apache.avro.data.RecordBuilder { + + private java.util.List> int_arrays_column; + + /** Creates a new Builder */ + private Builder() { + super(org.apache.spark.sql.execution.datasources.parquet.test.avro.AvroArrayOfArray.SCHEMA$); + } + + /** Creates a Builder by copying an existing Builder */ + private Builder(org.apache.spark.sql.execution.datasources.parquet.test.avro.AvroArrayOfArray.Builder other) { + super(other); + if (isValidValue(fields()[0], other.int_arrays_column)) { + this.int_arrays_column = data().deepCopy(fields()[0].schema(), other.int_arrays_column); + fieldSetFlags()[0] = true; + } + } + + /** Creates a Builder by copying an existing AvroArrayOfArray instance */ + private Builder(org.apache.spark.sql.execution.datasources.parquet.test.avro.AvroArrayOfArray other) { + super(org.apache.spark.sql.execution.datasources.parquet.test.avro.AvroArrayOfArray.SCHEMA$); + if (isValidValue(fields()[0], other.int_arrays_column)) { + this.int_arrays_column = data().deepCopy(fields()[0].schema(), other.int_arrays_column); + fieldSetFlags()[0] = true; + } + } + + /** Gets the value of the 'int_arrays_column' field */ + public java.util.List> getIntArraysColumn() { + return int_arrays_column; + } + + /** Sets the value of the 'int_arrays_column' field */ + public org.apache.spark.sql.execution.datasources.parquet.test.avro.AvroArrayOfArray.Builder setIntArraysColumn(java.util.List> value) { + validate(fields()[0], value); + this.int_arrays_column = value; + fieldSetFlags()[0] = true; + return this; + } + + /** Checks whether the 'int_arrays_column' field has been set */ + public boolean hasIntArraysColumn() { + return fieldSetFlags()[0]; + } + + /** Clears the value of the 'int_arrays_column' field */ + public org.apache.spark.sql.execution.datasources.parquet.test.avro.AvroArrayOfArray.Builder clearIntArraysColumn() { + int_arrays_column = null; + fieldSetFlags()[0] = false; + return this; + } + + @Override + public AvroArrayOfArray build() { + try { + AvroArrayOfArray record = new AvroArrayOfArray(); + record.int_arrays_column = fieldSetFlags()[0] ? 
this.int_arrays_column : (java.util.List>) defaultValue(fields()[0]); + return record; + } catch (Exception e) { + throw new org.apache.avro.AvroRuntimeException(e); + } + } + } +} diff --git a/sql/core/src/test/gen-java/org/apache/spark/sql/execution/datasources/parquet/test/avro/AvroMapOfArray.java b/sql/core/src/test/gen-java/org/apache/spark/sql/execution/datasources/parquet/test/avro/AvroMapOfArray.java new file mode 100644 index 0000000000000..727f6a7bf733e --- /dev/null +++ b/sql/core/src/test/gen-java/org/apache/spark/sql/execution/datasources/parquet/test/avro/AvroMapOfArray.java @@ -0,0 +1,142 @@ +/** + * Autogenerated by Avro + * + * DO NOT EDIT DIRECTLY + */ +package org.apache.spark.sql.execution.datasources.parquet.test.avro; +@SuppressWarnings("all") +@org.apache.avro.specific.AvroGenerated +public class AvroMapOfArray extends org.apache.avro.specific.SpecificRecordBase implements org.apache.avro.specific.SpecificRecord { + public static final org.apache.avro.Schema SCHEMA$ = new org.apache.avro.Schema.Parser().parse("{\"type\":\"record\",\"name\":\"AvroMapOfArray\",\"namespace\":\"org.apache.spark.sql.execution.datasources.parquet.test.avro\",\"fields\":[{\"name\":\"string_to_ints_column\",\"type\":{\"type\":\"map\",\"values\":{\"type\":\"array\",\"items\":\"int\"},\"avro.java.string\":\"String\"}}]}"); + public static org.apache.avro.Schema getClassSchema() { return SCHEMA$; } + @Deprecated public java.util.Map> string_to_ints_column; + + /** + * Default constructor. Note that this does not initialize fields + * to their default values from the schema. If that is desired then + * one should use newBuilder(). + */ + public AvroMapOfArray() {} + + /** + * All-args constructor. + */ + public AvroMapOfArray(java.util.Map> string_to_ints_column) { + this.string_to_ints_column = string_to_ints_column; + } + + public org.apache.avro.Schema getSchema() { return SCHEMA$; } + // Used by DatumWriter. Applications should not call. + public java.lang.Object get(int field$) { + switch (field$) { + case 0: return string_to_ints_column; + default: throw new org.apache.avro.AvroRuntimeException("Bad index"); + } + } + // Used by DatumReader. Applications should not call. + @SuppressWarnings(value="unchecked") + public void put(int field$, java.lang.Object value$) { + switch (field$) { + case 0: string_to_ints_column = (java.util.Map>)value$; break; + default: throw new org.apache.avro.AvroRuntimeException("Bad index"); + } + } + + /** + * Gets the value of the 'string_to_ints_column' field. + */ + public java.util.Map> getStringToIntsColumn() { + return string_to_ints_column; + } + + /** + * Sets the value of the 'string_to_ints_column' field. + * @param value the value to set. 
+ */ + public void setStringToIntsColumn(java.util.Map> value) { + this.string_to_ints_column = value; + } + + /** Creates a new AvroMapOfArray RecordBuilder */ + public static org.apache.spark.sql.execution.datasources.parquet.test.avro.AvroMapOfArray.Builder newBuilder() { + return new org.apache.spark.sql.execution.datasources.parquet.test.avro.AvroMapOfArray.Builder(); + } + + /** Creates a new AvroMapOfArray RecordBuilder by copying an existing Builder */ + public static org.apache.spark.sql.execution.datasources.parquet.test.avro.AvroMapOfArray.Builder newBuilder(org.apache.spark.sql.execution.datasources.parquet.test.avro.AvroMapOfArray.Builder other) { + return new org.apache.spark.sql.execution.datasources.parquet.test.avro.AvroMapOfArray.Builder(other); + } + + /** Creates a new AvroMapOfArray RecordBuilder by copying an existing AvroMapOfArray instance */ + public static org.apache.spark.sql.execution.datasources.parquet.test.avro.AvroMapOfArray.Builder newBuilder(org.apache.spark.sql.execution.datasources.parquet.test.avro.AvroMapOfArray other) { + return new org.apache.spark.sql.execution.datasources.parquet.test.avro.AvroMapOfArray.Builder(other); + } + + /** + * RecordBuilder for AvroMapOfArray instances. + */ + public static class Builder extends org.apache.avro.specific.SpecificRecordBuilderBase + implements org.apache.avro.data.RecordBuilder { + + private java.util.Map> string_to_ints_column; + + /** Creates a new Builder */ + private Builder() { + super(org.apache.spark.sql.execution.datasources.parquet.test.avro.AvroMapOfArray.SCHEMA$); + } + + /** Creates a Builder by copying an existing Builder */ + private Builder(org.apache.spark.sql.execution.datasources.parquet.test.avro.AvroMapOfArray.Builder other) { + super(other); + if (isValidValue(fields()[0], other.string_to_ints_column)) { + this.string_to_ints_column = data().deepCopy(fields()[0].schema(), other.string_to_ints_column); + fieldSetFlags()[0] = true; + } + } + + /** Creates a Builder by copying an existing AvroMapOfArray instance */ + private Builder(org.apache.spark.sql.execution.datasources.parquet.test.avro.AvroMapOfArray other) { + super(org.apache.spark.sql.execution.datasources.parquet.test.avro.AvroMapOfArray.SCHEMA$); + if (isValidValue(fields()[0], other.string_to_ints_column)) { + this.string_to_ints_column = data().deepCopy(fields()[0].schema(), other.string_to_ints_column); + fieldSetFlags()[0] = true; + } + } + + /** Gets the value of the 'string_to_ints_column' field */ + public java.util.Map> getStringToIntsColumn() { + return string_to_ints_column; + } + + /** Sets the value of the 'string_to_ints_column' field */ + public org.apache.spark.sql.execution.datasources.parquet.test.avro.AvroMapOfArray.Builder setStringToIntsColumn(java.util.Map> value) { + validate(fields()[0], value); + this.string_to_ints_column = value; + fieldSetFlags()[0] = true; + return this; + } + + /** Checks whether the 'string_to_ints_column' field has been set */ + public boolean hasStringToIntsColumn() { + return fieldSetFlags()[0]; + } + + /** Clears the value of the 'string_to_ints_column' field */ + public org.apache.spark.sql.execution.datasources.parquet.test.avro.AvroMapOfArray.Builder clearStringToIntsColumn() { + string_to_ints_column = null; + fieldSetFlags()[0] = false; + return this; + } + + @Override + public AvroMapOfArray build() { + try { + AvroMapOfArray record = new AvroMapOfArray(); + record.string_to_ints_column = fieldSetFlags()[0] ? 
this.string_to_ints_column : (java.util.Map>) defaultValue(fields()[0]); + return record; + } catch (Exception e) { + throw new org.apache.avro.AvroRuntimeException(e); + } + } + } +} diff --git a/sql/core/src/test/gen-java/org/apache/spark/sql/execution/datasources/parquet/test/avro/AvroNonNullableArrays.java b/sql/core/src/test/gen-java/org/apache/spark/sql/execution/datasources/parquet/test/avro/AvroNonNullableArrays.java new file mode 100644 index 0000000000000..934793f42f9c9 --- /dev/null +++ b/sql/core/src/test/gen-java/org/apache/spark/sql/execution/datasources/parquet/test/avro/AvroNonNullableArrays.java @@ -0,0 +1,196 @@ +/** + * Autogenerated by Avro + * + * DO NOT EDIT DIRECTLY + */ +package org.apache.spark.sql.execution.datasources.parquet.test.avro; +@SuppressWarnings("all") +@org.apache.avro.specific.AvroGenerated +public class AvroNonNullableArrays extends org.apache.avro.specific.SpecificRecordBase implements org.apache.avro.specific.SpecificRecord { + public static final org.apache.avro.Schema SCHEMA$ = new org.apache.avro.Schema.Parser().parse("{\"type\":\"record\",\"name\":\"AvroNonNullableArrays\",\"namespace\":\"org.apache.spark.sql.execution.datasources.parquet.test.avro\",\"fields\":[{\"name\":\"strings_column\",\"type\":{\"type\":\"array\",\"items\":{\"type\":\"string\",\"avro.java.string\":\"String\"}}},{\"name\":\"maybe_ints_column\",\"type\":[\"null\",{\"type\":\"array\",\"items\":\"int\"}]}]}"); + public static org.apache.avro.Schema getClassSchema() { return SCHEMA$; } + @Deprecated public java.util.List strings_column; + @Deprecated public java.util.List maybe_ints_column; + + /** + * Default constructor. Note that this does not initialize fields + * to their default values from the schema. If that is desired then + * one should use newBuilder(). + */ + public AvroNonNullableArrays() {} + + /** + * All-args constructor. + */ + public AvroNonNullableArrays(java.util.List strings_column, java.util.List maybe_ints_column) { + this.strings_column = strings_column; + this.maybe_ints_column = maybe_ints_column; + } + + public org.apache.avro.Schema getSchema() { return SCHEMA$; } + // Used by DatumWriter. Applications should not call. + public java.lang.Object get(int field$) { + switch (field$) { + case 0: return strings_column; + case 1: return maybe_ints_column; + default: throw new org.apache.avro.AvroRuntimeException("Bad index"); + } + } + // Used by DatumReader. Applications should not call. + @SuppressWarnings(value="unchecked") + public void put(int field$, java.lang.Object value$) { + switch (field$) { + case 0: strings_column = (java.util.List)value$; break; + case 1: maybe_ints_column = (java.util.List)value$; break; + default: throw new org.apache.avro.AvroRuntimeException("Bad index"); + } + } + + /** + * Gets the value of the 'strings_column' field. + */ + public java.util.List getStringsColumn() { + return strings_column; + } + + /** + * Sets the value of the 'strings_column' field. + * @param value the value to set. + */ + public void setStringsColumn(java.util.List value) { + this.strings_column = value; + } + + /** + * Gets the value of the 'maybe_ints_column' field. + */ + public java.util.List getMaybeIntsColumn() { + return maybe_ints_column; + } + + /** + * Sets the value of the 'maybe_ints_column' field. + * @param value the value to set. 
+ */ + public void setMaybeIntsColumn(java.util.List value) { + this.maybe_ints_column = value; + } + + /** Creates a new AvroNonNullableArrays RecordBuilder */ + public static org.apache.spark.sql.execution.datasources.parquet.test.avro.AvroNonNullableArrays.Builder newBuilder() { + return new org.apache.spark.sql.execution.datasources.parquet.test.avro.AvroNonNullableArrays.Builder(); + } + + /** Creates a new AvroNonNullableArrays RecordBuilder by copying an existing Builder */ + public static org.apache.spark.sql.execution.datasources.parquet.test.avro.AvroNonNullableArrays.Builder newBuilder(org.apache.spark.sql.execution.datasources.parquet.test.avro.AvroNonNullableArrays.Builder other) { + return new org.apache.spark.sql.execution.datasources.parquet.test.avro.AvroNonNullableArrays.Builder(other); + } + + /** Creates a new AvroNonNullableArrays RecordBuilder by copying an existing AvroNonNullableArrays instance */ + public static org.apache.spark.sql.execution.datasources.parquet.test.avro.AvroNonNullableArrays.Builder newBuilder(org.apache.spark.sql.execution.datasources.parquet.test.avro.AvroNonNullableArrays other) { + return new org.apache.spark.sql.execution.datasources.parquet.test.avro.AvroNonNullableArrays.Builder(other); + } + + /** + * RecordBuilder for AvroNonNullableArrays instances. + */ + public static class Builder extends org.apache.avro.specific.SpecificRecordBuilderBase + implements org.apache.avro.data.RecordBuilder { + + private java.util.List strings_column; + private java.util.List maybe_ints_column; + + /** Creates a new Builder */ + private Builder() { + super(org.apache.spark.sql.execution.datasources.parquet.test.avro.AvroNonNullableArrays.SCHEMA$); + } + + /** Creates a Builder by copying an existing Builder */ + private Builder(org.apache.spark.sql.execution.datasources.parquet.test.avro.AvroNonNullableArrays.Builder other) { + super(other); + if (isValidValue(fields()[0], other.strings_column)) { + this.strings_column = data().deepCopy(fields()[0].schema(), other.strings_column); + fieldSetFlags()[0] = true; + } + if (isValidValue(fields()[1], other.maybe_ints_column)) { + this.maybe_ints_column = data().deepCopy(fields()[1].schema(), other.maybe_ints_column); + fieldSetFlags()[1] = true; + } + } + + /** Creates a Builder by copying an existing AvroNonNullableArrays instance */ + private Builder(org.apache.spark.sql.execution.datasources.parquet.test.avro.AvroNonNullableArrays other) { + super(org.apache.spark.sql.execution.datasources.parquet.test.avro.AvroNonNullableArrays.SCHEMA$); + if (isValidValue(fields()[0], other.strings_column)) { + this.strings_column = data().deepCopy(fields()[0].schema(), other.strings_column); + fieldSetFlags()[0] = true; + } + if (isValidValue(fields()[1], other.maybe_ints_column)) { + this.maybe_ints_column = data().deepCopy(fields()[1].schema(), other.maybe_ints_column); + fieldSetFlags()[1] = true; + } + } + + /** Gets the value of the 'strings_column' field */ + public java.util.List getStringsColumn() { + return strings_column; + } + + /** Sets the value of the 'strings_column' field */ + public org.apache.spark.sql.execution.datasources.parquet.test.avro.AvroNonNullableArrays.Builder setStringsColumn(java.util.List value) { + validate(fields()[0], value); + this.strings_column = value; + fieldSetFlags()[0] = true; + return this; + } + + /** Checks whether the 'strings_column' field has been set */ + public boolean hasStringsColumn() { + return fieldSetFlags()[0]; + } + + /** Clears the value of the 'strings_column' 
field */ + public org.apache.spark.sql.execution.datasources.parquet.test.avro.AvroNonNullableArrays.Builder clearStringsColumn() { + strings_column = null; + fieldSetFlags()[0] = false; + return this; + } + + /** Gets the value of the 'maybe_ints_column' field */ + public java.util.List getMaybeIntsColumn() { + return maybe_ints_column; + } + + /** Sets the value of the 'maybe_ints_column' field */ + public org.apache.spark.sql.execution.datasources.parquet.test.avro.AvroNonNullableArrays.Builder setMaybeIntsColumn(java.util.List value) { + validate(fields()[1], value); + this.maybe_ints_column = value; + fieldSetFlags()[1] = true; + return this; + } + + /** Checks whether the 'maybe_ints_column' field has been set */ + public boolean hasMaybeIntsColumn() { + return fieldSetFlags()[1]; + } + + /** Clears the value of the 'maybe_ints_column' field */ + public org.apache.spark.sql.execution.datasources.parquet.test.avro.AvroNonNullableArrays.Builder clearMaybeIntsColumn() { + maybe_ints_column = null; + fieldSetFlags()[1] = false; + return this; + } + + @Override + public AvroNonNullableArrays build() { + try { + AvroNonNullableArrays record = new AvroNonNullableArrays(); + record.strings_column = fieldSetFlags()[0] ? this.strings_column : (java.util.List) defaultValue(fields()[0]); + record.maybe_ints_column = fieldSetFlags()[1] ? this.maybe_ints_column : (java.util.List) defaultValue(fields()[1]); + return record; + } catch (Exception e) { + throw new org.apache.avro.AvroRuntimeException(e); + } + } + } +} diff --git a/sql/core/src/test/gen-java/org/apache/spark/sql/execution/datasources/parquet/test/avro/AvroOptionalPrimitives.java b/sql/core/src/test/gen-java/org/apache/spark/sql/execution/datasources/parquet/test/avro/AvroOptionalPrimitives.java new file mode 100644 index 0000000000000..e4d1ead8dd15f --- /dev/null +++ b/sql/core/src/test/gen-java/org/apache/spark/sql/execution/datasources/parquet/test/avro/AvroOptionalPrimitives.java @@ -0,0 +1,466 @@ +/** + * Autogenerated by Avro + * + * DO NOT EDIT DIRECTLY + */ +package org.apache.spark.sql.execution.datasources.parquet.test.avro; +@SuppressWarnings("all") +@org.apache.avro.specific.AvroGenerated +public class AvroOptionalPrimitives extends org.apache.avro.specific.SpecificRecordBase implements org.apache.avro.specific.SpecificRecord { + public static final org.apache.avro.Schema SCHEMA$ = new org.apache.avro.Schema.Parser().parse("{\"type\":\"record\",\"name\":\"AvroOptionalPrimitives\",\"namespace\":\"org.apache.spark.sql.execution.datasources.parquet.test.avro\",\"fields\":[{\"name\":\"maybe_bool_column\",\"type\":[\"null\",\"boolean\"]},{\"name\":\"maybe_int_column\",\"type\":[\"null\",\"int\"]},{\"name\":\"maybe_long_column\",\"type\":[\"null\",\"long\"]},{\"name\":\"maybe_float_column\",\"type\":[\"null\",\"float\"]},{\"name\":\"maybe_double_column\",\"type\":[\"null\",\"double\"]},{\"name\":\"maybe_binary_column\",\"type\":[\"null\",\"bytes\"]},{\"name\":\"maybe_string_column\",\"type\":[\"null\",{\"type\":\"string\",\"avro.java.string\":\"String\"}]}]}"); + public static org.apache.avro.Schema getClassSchema() { return SCHEMA$; } + @Deprecated public java.lang.Boolean maybe_bool_column; + @Deprecated public java.lang.Integer maybe_int_column; + @Deprecated public java.lang.Long maybe_long_column; + @Deprecated public java.lang.Float maybe_float_column; + @Deprecated public java.lang.Double maybe_double_column; + @Deprecated public java.nio.ByteBuffer maybe_binary_column; + @Deprecated public java.lang.String 
maybe_string_column; + + /** + * Default constructor. Note that this does not initialize fields + * to their default values from the schema. If that is desired then + * one should use newBuilder(). + */ + public AvroOptionalPrimitives() {} + + /** + * All-args constructor. + */ + public AvroOptionalPrimitives(java.lang.Boolean maybe_bool_column, java.lang.Integer maybe_int_column, java.lang.Long maybe_long_column, java.lang.Float maybe_float_column, java.lang.Double maybe_double_column, java.nio.ByteBuffer maybe_binary_column, java.lang.String maybe_string_column) { + this.maybe_bool_column = maybe_bool_column; + this.maybe_int_column = maybe_int_column; + this.maybe_long_column = maybe_long_column; + this.maybe_float_column = maybe_float_column; + this.maybe_double_column = maybe_double_column; + this.maybe_binary_column = maybe_binary_column; + this.maybe_string_column = maybe_string_column; + } + + public org.apache.avro.Schema getSchema() { return SCHEMA$; } + // Used by DatumWriter. Applications should not call. + public java.lang.Object get(int field$) { + switch (field$) { + case 0: return maybe_bool_column; + case 1: return maybe_int_column; + case 2: return maybe_long_column; + case 3: return maybe_float_column; + case 4: return maybe_double_column; + case 5: return maybe_binary_column; + case 6: return maybe_string_column; + default: throw new org.apache.avro.AvroRuntimeException("Bad index"); + } + } + // Used by DatumReader. Applications should not call. + @SuppressWarnings(value="unchecked") + public void put(int field$, java.lang.Object value$) { + switch (field$) { + case 0: maybe_bool_column = (java.lang.Boolean)value$; break; + case 1: maybe_int_column = (java.lang.Integer)value$; break; + case 2: maybe_long_column = (java.lang.Long)value$; break; + case 3: maybe_float_column = (java.lang.Float)value$; break; + case 4: maybe_double_column = (java.lang.Double)value$; break; + case 5: maybe_binary_column = (java.nio.ByteBuffer)value$; break; + case 6: maybe_string_column = (java.lang.String)value$; break; + default: throw new org.apache.avro.AvroRuntimeException("Bad index"); + } + } + + /** + * Gets the value of the 'maybe_bool_column' field. + */ + public java.lang.Boolean getMaybeBoolColumn() { + return maybe_bool_column; + } + + /** + * Sets the value of the 'maybe_bool_column' field. + * @param value the value to set. + */ + public void setMaybeBoolColumn(java.lang.Boolean value) { + this.maybe_bool_column = value; + } + + /** + * Gets the value of the 'maybe_int_column' field. + */ + public java.lang.Integer getMaybeIntColumn() { + return maybe_int_column; + } + + /** + * Sets the value of the 'maybe_int_column' field. + * @param value the value to set. + */ + public void setMaybeIntColumn(java.lang.Integer value) { + this.maybe_int_column = value; + } + + /** + * Gets the value of the 'maybe_long_column' field. + */ + public java.lang.Long getMaybeLongColumn() { + return maybe_long_column; + } + + /** + * Sets the value of the 'maybe_long_column' field. + * @param value the value to set. + */ + public void setMaybeLongColumn(java.lang.Long value) { + this.maybe_long_column = value; + } + + /** + * Gets the value of the 'maybe_float_column' field. + */ + public java.lang.Float getMaybeFloatColumn() { + return maybe_float_column; + } + + /** + * Sets the value of the 'maybe_float_column' field. + * @param value the value to set. 
+ */ + public void setMaybeFloatColumn(java.lang.Float value) { + this.maybe_float_column = value; + } + + /** + * Gets the value of the 'maybe_double_column' field. + */ + public java.lang.Double getMaybeDoubleColumn() { + return maybe_double_column; + } + + /** + * Sets the value of the 'maybe_double_column' field. + * @param value the value to set. + */ + public void setMaybeDoubleColumn(java.lang.Double value) { + this.maybe_double_column = value; + } + + /** + * Gets the value of the 'maybe_binary_column' field. + */ + public java.nio.ByteBuffer getMaybeBinaryColumn() { + return maybe_binary_column; + } + + /** + * Sets the value of the 'maybe_binary_column' field. + * @param value the value to set. + */ + public void setMaybeBinaryColumn(java.nio.ByteBuffer value) { + this.maybe_binary_column = value; + } + + /** + * Gets the value of the 'maybe_string_column' field. + */ + public java.lang.String getMaybeStringColumn() { + return maybe_string_column; + } + + /** + * Sets the value of the 'maybe_string_column' field. + * @param value the value to set. + */ + public void setMaybeStringColumn(java.lang.String value) { + this.maybe_string_column = value; + } + + /** Creates a new AvroOptionalPrimitives RecordBuilder */ + public static org.apache.spark.sql.execution.datasources.parquet.test.avro.AvroOptionalPrimitives.Builder newBuilder() { + return new org.apache.spark.sql.execution.datasources.parquet.test.avro.AvroOptionalPrimitives.Builder(); + } + + /** Creates a new AvroOptionalPrimitives RecordBuilder by copying an existing Builder */ + public static org.apache.spark.sql.execution.datasources.parquet.test.avro.AvroOptionalPrimitives.Builder newBuilder(org.apache.spark.sql.execution.datasources.parquet.test.avro.AvroOptionalPrimitives.Builder other) { + return new org.apache.spark.sql.execution.datasources.parquet.test.avro.AvroOptionalPrimitives.Builder(other); + } + + /** Creates a new AvroOptionalPrimitives RecordBuilder by copying an existing AvroOptionalPrimitives instance */ + public static org.apache.spark.sql.execution.datasources.parquet.test.avro.AvroOptionalPrimitives.Builder newBuilder(org.apache.spark.sql.execution.datasources.parquet.test.avro.AvroOptionalPrimitives other) { + return new org.apache.spark.sql.execution.datasources.parquet.test.avro.AvroOptionalPrimitives.Builder(other); + } + + /** + * RecordBuilder for AvroOptionalPrimitives instances. 
+ */ + public static class Builder extends org.apache.avro.specific.SpecificRecordBuilderBase + implements org.apache.avro.data.RecordBuilder { + + private java.lang.Boolean maybe_bool_column; + private java.lang.Integer maybe_int_column; + private java.lang.Long maybe_long_column; + private java.lang.Float maybe_float_column; + private java.lang.Double maybe_double_column; + private java.nio.ByteBuffer maybe_binary_column; + private java.lang.String maybe_string_column; + + /** Creates a new Builder */ + private Builder() { + super(org.apache.spark.sql.execution.datasources.parquet.test.avro.AvroOptionalPrimitives.SCHEMA$); + } + + /** Creates a Builder by copying an existing Builder */ + private Builder(org.apache.spark.sql.execution.datasources.parquet.test.avro.AvroOptionalPrimitives.Builder other) { + super(other); + if (isValidValue(fields()[0], other.maybe_bool_column)) { + this.maybe_bool_column = data().deepCopy(fields()[0].schema(), other.maybe_bool_column); + fieldSetFlags()[0] = true; + } + if (isValidValue(fields()[1], other.maybe_int_column)) { + this.maybe_int_column = data().deepCopy(fields()[1].schema(), other.maybe_int_column); + fieldSetFlags()[1] = true; + } + if (isValidValue(fields()[2], other.maybe_long_column)) { + this.maybe_long_column = data().deepCopy(fields()[2].schema(), other.maybe_long_column); + fieldSetFlags()[2] = true; + } + if (isValidValue(fields()[3], other.maybe_float_column)) { + this.maybe_float_column = data().deepCopy(fields()[3].schema(), other.maybe_float_column); + fieldSetFlags()[3] = true; + } + if (isValidValue(fields()[4], other.maybe_double_column)) { + this.maybe_double_column = data().deepCopy(fields()[4].schema(), other.maybe_double_column); + fieldSetFlags()[4] = true; + } + if (isValidValue(fields()[5], other.maybe_binary_column)) { + this.maybe_binary_column = data().deepCopy(fields()[5].schema(), other.maybe_binary_column); + fieldSetFlags()[5] = true; + } + if (isValidValue(fields()[6], other.maybe_string_column)) { + this.maybe_string_column = data().deepCopy(fields()[6].schema(), other.maybe_string_column); + fieldSetFlags()[6] = true; + } + } + + /** Creates a Builder by copying an existing AvroOptionalPrimitives instance */ + private Builder(org.apache.spark.sql.execution.datasources.parquet.test.avro.AvroOptionalPrimitives other) { + super(org.apache.spark.sql.execution.datasources.parquet.test.avro.AvroOptionalPrimitives.SCHEMA$); + if (isValidValue(fields()[0], other.maybe_bool_column)) { + this.maybe_bool_column = data().deepCopy(fields()[0].schema(), other.maybe_bool_column); + fieldSetFlags()[0] = true; + } + if (isValidValue(fields()[1], other.maybe_int_column)) { + this.maybe_int_column = data().deepCopy(fields()[1].schema(), other.maybe_int_column); + fieldSetFlags()[1] = true; + } + if (isValidValue(fields()[2], other.maybe_long_column)) { + this.maybe_long_column = data().deepCopy(fields()[2].schema(), other.maybe_long_column); + fieldSetFlags()[2] = true; + } + if (isValidValue(fields()[3], other.maybe_float_column)) { + this.maybe_float_column = data().deepCopy(fields()[3].schema(), other.maybe_float_column); + fieldSetFlags()[3] = true; + } + if (isValidValue(fields()[4], other.maybe_double_column)) { + this.maybe_double_column = data().deepCopy(fields()[4].schema(), other.maybe_double_column); + fieldSetFlags()[4] = true; + } + if (isValidValue(fields()[5], other.maybe_binary_column)) { + this.maybe_binary_column = data().deepCopy(fields()[5].schema(), other.maybe_binary_column); + fieldSetFlags()[5] = true; + } 
+ if (isValidValue(fields()[6], other.maybe_string_column)) { + this.maybe_string_column = data().deepCopy(fields()[6].schema(), other.maybe_string_column); + fieldSetFlags()[6] = true; + } + } + + /** Gets the value of the 'maybe_bool_column' field */ + public java.lang.Boolean getMaybeBoolColumn() { + return maybe_bool_column; + } + + /** Sets the value of the 'maybe_bool_column' field */ + public org.apache.spark.sql.execution.datasources.parquet.test.avro.AvroOptionalPrimitives.Builder setMaybeBoolColumn(java.lang.Boolean value) { + validate(fields()[0], value); + this.maybe_bool_column = value; + fieldSetFlags()[0] = true; + return this; + } + + /** Checks whether the 'maybe_bool_column' field has been set */ + public boolean hasMaybeBoolColumn() { + return fieldSetFlags()[0]; + } + + /** Clears the value of the 'maybe_bool_column' field */ + public org.apache.spark.sql.execution.datasources.parquet.test.avro.AvroOptionalPrimitives.Builder clearMaybeBoolColumn() { + maybe_bool_column = null; + fieldSetFlags()[0] = false; + return this; + } + + /** Gets the value of the 'maybe_int_column' field */ + public java.lang.Integer getMaybeIntColumn() { + return maybe_int_column; + } + + /** Sets the value of the 'maybe_int_column' field */ + public org.apache.spark.sql.execution.datasources.parquet.test.avro.AvroOptionalPrimitives.Builder setMaybeIntColumn(java.lang.Integer value) { + validate(fields()[1], value); + this.maybe_int_column = value; + fieldSetFlags()[1] = true; + return this; + } + + /** Checks whether the 'maybe_int_column' field has been set */ + public boolean hasMaybeIntColumn() { + return fieldSetFlags()[1]; + } + + /** Clears the value of the 'maybe_int_column' field */ + public org.apache.spark.sql.execution.datasources.parquet.test.avro.AvroOptionalPrimitives.Builder clearMaybeIntColumn() { + maybe_int_column = null; + fieldSetFlags()[1] = false; + return this; + } + + /** Gets the value of the 'maybe_long_column' field */ + public java.lang.Long getMaybeLongColumn() { + return maybe_long_column; + } + + /** Sets the value of the 'maybe_long_column' field */ + public org.apache.spark.sql.execution.datasources.parquet.test.avro.AvroOptionalPrimitives.Builder setMaybeLongColumn(java.lang.Long value) { + validate(fields()[2], value); + this.maybe_long_column = value; + fieldSetFlags()[2] = true; + return this; + } + + /** Checks whether the 'maybe_long_column' field has been set */ + public boolean hasMaybeLongColumn() { + return fieldSetFlags()[2]; + } + + /** Clears the value of the 'maybe_long_column' field */ + public org.apache.spark.sql.execution.datasources.parquet.test.avro.AvroOptionalPrimitives.Builder clearMaybeLongColumn() { + maybe_long_column = null; + fieldSetFlags()[2] = false; + return this; + } + + /** Gets the value of the 'maybe_float_column' field */ + public java.lang.Float getMaybeFloatColumn() { + return maybe_float_column; + } + + /** Sets the value of the 'maybe_float_column' field */ + public org.apache.spark.sql.execution.datasources.parquet.test.avro.AvroOptionalPrimitives.Builder setMaybeFloatColumn(java.lang.Float value) { + validate(fields()[3], value); + this.maybe_float_column = value; + fieldSetFlags()[3] = true; + return this; + } + + /** Checks whether the 'maybe_float_column' field has been set */ + public boolean hasMaybeFloatColumn() { + return fieldSetFlags()[3]; + } + + /** Clears the value of the 'maybe_float_column' field */ + public org.apache.spark.sql.execution.datasources.parquet.test.avro.AvroOptionalPrimitives.Builder 
clearMaybeFloatColumn() { + maybe_float_column = null; + fieldSetFlags()[3] = false; + return this; + } + + /** Gets the value of the 'maybe_double_column' field */ + public java.lang.Double getMaybeDoubleColumn() { + return maybe_double_column; + } + + /** Sets the value of the 'maybe_double_column' field */ + public org.apache.spark.sql.execution.datasources.parquet.test.avro.AvroOptionalPrimitives.Builder setMaybeDoubleColumn(java.lang.Double value) { + validate(fields()[4], value); + this.maybe_double_column = value; + fieldSetFlags()[4] = true; + return this; + } + + /** Checks whether the 'maybe_double_column' field has been set */ + public boolean hasMaybeDoubleColumn() { + return fieldSetFlags()[4]; + } + + /** Clears the value of the 'maybe_double_column' field */ + public org.apache.spark.sql.execution.datasources.parquet.test.avro.AvroOptionalPrimitives.Builder clearMaybeDoubleColumn() { + maybe_double_column = null; + fieldSetFlags()[4] = false; + return this; + } + + /** Gets the value of the 'maybe_binary_column' field */ + public java.nio.ByteBuffer getMaybeBinaryColumn() { + return maybe_binary_column; + } + + /** Sets the value of the 'maybe_binary_column' field */ + public org.apache.spark.sql.execution.datasources.parquet.test.avro.AvroOptionalPrimitives.Builder setMaybeBinaryColumn(java.nio.ByteBuffer value) { + validate(fields()[5], value); + this.maybe_binary_column = value; + fieldSetFlags()[5] = true; + return this; + } + + /** Checks whether the 'maybe_binary_column' field has been set */ + public boolean hasMaybeBinaryColumn() { + return fieldSetFlags()[5]; + } + + /** Clears the value of the 'maybe_binary_column' field */ + public org.apache.spark.sql.execution.datasources.parquet.test.avro.AvroOptionalPrimitives.Builder clearMaybeBinaryColumn() { + maybe_binary_column = null; + fieldSetFlags()[5] = false; + return this; + } + + /** Gets the value of the 'maybe_string_column' field */ + public java.lang.String getMaybeStringColumn() { + return maybe_string_column; + } + + /** Sets the value of the 'maybe_string_column' field */ + public org.apache.spark.sql.execution.datasources.parquet.test.avro.AvroOptionalPrimitives.Builder setMaybeStringColumn(java.lang.String value) { + validate(fields()[6], value); + this.maybe_string_column = value; + fieldSetFlags()[6] = true; + return this; + } + + /** Checks whether the 'maybe_string_column' field has been set */ + public boolean hasMaybeStringColumn() { + return fieldSetFlags()[6]; + } + + /** Clears the value of the 'maybe_string_column' field */ + public org.apache.spark.sql.execution.datasources.parquet.test.avro.AvroOptionalPrimitives.Builder clearMaybeStringColumn() { + maybe_string_column = null; + fieldSetFlags()[6] = false; + return this; + } + + @Override + public AvroOptionalPrimitives build() { + try { + AvroOptionalPrimitives record = new AvroOptionalPrimitives(); + record.maybe_bool_column = fieldSetFlags()[0] ? this.maybe_bool_column : (java.lang.Boolean) defaultValue(fields()[0]); + record.maybe_int_column = fieldSetFlags()[1] ? this.maybe_int_column : (java.lang.Integer) defaultValue(fields()[1]); + record.maybe_long_column = fieldSetFlags()[2] ? this.maybe_long_column : (java.lang.Long) defaultValue(fields()[2]); + record.maybe_float_column = fieldSetFlags()[3] ? this.maybe_float_column : (java.lang.Float) defaultValue(fields()[3]); + record.maybe_double_column = fieldSetFlags()[4] ? 
this.maybe_double_column : (java.lang.Double) defaultValue(fields()[4]); + record.maybe_binary_column = fieldSetFlags()[5] ? this.maybe_binary_column : (java.nio.ByteBuffer) defaultValue(fields()[5]); + record.maybe_string_column = fieldSetFlags()[6] ? this.maybe_string_column : (java.lang.String) defaultValue(fields()[6]); + return record; + } catch (Exception e) { + throw new org.apache.avro.AvroRuntimeException(e); + } + } + } +} diff --git a/sql/core/src/test/gen-java/org/apache/spark/sql/execution/datasources/parquet/test/avro/AvroPrimitives.java b/sql/core/src/test/gen-java/org/apache/spark/sql/execution/datasources/parquet/test/avro/AvroPrimitives.java new file mode 100644 index 0000000000000..1c2afed16781e --- /dev/null +++ b/sql/core/src/test/gen-java/org/apache/spark/sql/execution/datasources/parquet/test/avro/AvroPrimitives.java @@ -0,0 +1,461 @@ +/** + * Autogenerated by Avro + * + * DO NOT EDIT DIRECTLY + */ +package org.apache.spark.sql.execution.datasources.parquet.test.avro; +@SuppressWarnings("all") +@org.apache.avro.specific.AvroGenerated +public class AvroPrimitives extends org.apache.avro.specific.SpecificRecordBase implements org.apache.avro.specific.SpecificRecord { + public static final org.apache.avro.Schema SCHEMA$ = new org.apache.avro.Schema.Parser().parse("{\"type\":\"record\",\"name\":\"AvroPrimitives\",\"namespace\":\"org.apache.spark.sql.execution.datasources.parquet.test.avro\",\"fields\":[{\"name\":\"bool_column\",\"type\":\"boolean\"},{\"name\":\"int_column\",\"type\":\"int\"},{\"name\":\"long_column\",\"type\":\"long\"},{\"name\":\"float_column\",\"type\":\"float\"},{\"name\":\"double_column\",\"type\":\"double\"},{\"name\":\"binary_column\",\"type\":\"bytes\"},{\"name\":\"string_column\",\"type\":{\"type\":\"string\",\"avro.java.string\":\"String\"}}]}"); + public static org.apache.avro.Schema getClassSchema() { return SCHEMA$; } + @Deprecated public boolean bool_column; + @Deprecated public int int_column; + @Deprecated public long long_column; + @Deprecated public float float_column; + @Deprecated public double double_column; + @Deprecated public java.nio.ByteBuffer binary_column; + @Deprecated public java.lang.String string_column; + + /** + * Default constructor. Note that this does not initialize fields + * to their default values from the schema. If that is desired then + * one should use newBuilder(). + */ + public AvroPrimitives() {} + + /** + * All-args constructor. + */ + public AvroPrimitives(java.lang.Boolean bool_column, java.lang.Integer int_column, java.lang.Long long_column, java.lang.Float float_column, java.lang.Double double_column, java.nio.ByteBuffer binary_column, java.lang.String string_column) { + this.bool_column = bool_column; + this.int_column = int_column; + this.long_column = long_column; + this.float_column = float_column; + this.double_column = double_column; + this.binary_column = binary_column; + this.string_column = string_column; + } + + public org.apache.avro.Schema getSchema() { return SCHEMA$; } + // Used by DatumWriter. Applications should not call. + public java.lang.Object get(int field$) { + switch (field$) { + case 0: return bool_column; + case 1: return int_column; + case 2: return long_column; + case 3: return float_column; + case 4: return double_column; + case 5: return binary_column; + case 6: return string_column; + default: throw new org.apache.avro.AvroRuntimeException("Bad index"); + } + } + // Used by DatumReader. Applications should not call. 
+ @SuppressWarnings(value="unchecked") + public void put(int field$, java.lang.Object value$) { + switch (field$) { + case 0: bool_column = (java.lang.Boolean)value$; break; + case 1: int_column = (java.lang.Integer)value$; break; + case 2: long_column = (java.lang.Long)value$; break; + case 3: float_column = (java.lang.Float)value$; break; + case 4: double_column = (java.lang.Double)value$; break; + case 5: binary_column = (java.nio.ByteBuffer)value$; break; + case 6: string_column = (java.lang.String)value$; break; + default: throw new org.apache.avro.AvroRuntimeException("Bad index"); + } + } + + /** + * Gets the value of the 'bool_column' field. + */ + public java.lang.Boolean getBoolColumn() { + return bool_column; + } + + /** + * Sets the value of the 'bool_column' field. + * @param value the value to set. + */ + public void setBoolColumn(java.lang.Boolean value) { + this.bool_column = value; + } + + /** + * Gets the value of the 'int_column' field. + */ + public java.lang.Integer getIntColumn() { + return int_column; + } + + /** + * Sets the value of the 'int_column' field. + * @param value the value to set. + */ + public void setIntColumn(java.lang.Integer value) { + this.int_column = value; + } + + /** + * Gets the value of the 'long_column' field. + */ + public java.lang.Long getLongColumn() { + return long_column; + } + + /** + * Sets the value of the 'long_column' field. + * @param value the value to set. + */ + public void setLongColumn(java.lang.Long value) { + this.long_column = value; + } + + /** + * Gets the value of the 'float_column' field. + */ + public java.lang.Float getFloatColumn() { + return float_column; + } + + /** + * Sets the value of the 'float_column' field. + * @param value the value to set. + */ + public void setFloatColumn(java.lang.Float value) { + this.float_column = value; + } + + /** + * Gets the value of the 'double_column' field. + */ + public java.lang.Double getDoubleColumn() { + return double_column; + } + + /** + * Sets the value of the 'double_column' field. + * @param value the value to set. + */ + public void setDoubleColumn(java.lang.Double value) { + this.double_column = value; + } + + /** + * Gets the value of the 'binary_column' field. + */ + public java.nio.ByteBuffer getBinaryColumn() { + return binary_column; + } + + /** + * Sets the value of the 'binary_column' field. + * @param value the value to set. + */ + public void setBinaryColumn(java.nio.ByteBuffer value) { + this.binary_column = value; + } + + /** + * Gets the value of the 'string_column' field. + */ + public java.lang.String getStringColumn() { + return string_column; + } + + /** + * Sets the value of the 'string_column' field. + * @param value the value to set. 
+ */ + public void setStringColumn(java.lang.String value) { + this.string_column = value; + } + + /** Creates a new AvroPrimitives RecordBuilder */ + public static org.apache.spark.sql.execution.datasources.parquet.test.avro.AvroPrimitives.Builder newBuilder() { + return new org.apache.spark.sql.execution.datasources.parquet.test.avro.AvroPrimitives.Builder(); + } + + /** Creates a new AvroPrimitives RecordBuilder by copying an existing Builder */ + public static org.apache.spark.sql.execution.datasources.parquet.test.avro.AvroPrimitives.Builder newBuilder(org.apache.spark.sql.execution.datasources.parquet.test.avro.AvroPrimitives.Builder other) { + return new org.apache.spark.sql.execution.datasources.parquet.test.avro.AvroPrimitives.Builder(other); + } + + /** Creates a new AvroPrimitives RecordBuilder by copying an existing AvroPrimitives instance */ + public static org.apache.spark.sql.execution.datasources.parquet.test.avro.AvroPrimitives.Builder newBuilder(org.apache.spark.sql.execution.datasources.parquet.test.avro.AvroPrimitives other) { + return new org.apache.spark.sql.execution.datasources.parquet.test.avro.AvroPrimitives.Builder(other); + } + + /** + * RecordBuilder for AvroPrimitives instances. + */ + public static class Builder extends org.apache.avro.specific.SpecificRecordBuilderBase + implements org.apache.avro.data.RecordBuilder { + + private boolean bool_column; + private int int_column; + private long long_column; + private float float_column; + private double double_column; + private java.nio.ByteBuffer binary_column; + private java.lang.String string_column; + + /** Creates a new Builder */ + private Builder() { + super(org.apache.spark.sql.execution.datasources.parquet.test.avro.AvroPrimitives.SCHEMA$); + } + + /** Creates a Builder by copying an existing Builder */ + private Builder(org.apache.spark.sql.execution.datasources.parquet.test.avro.AvroPrimitives.Builder other) { + super(other); + if (isValidValue(fields()[0], other.bool_column)) { + this.bool_column = data().deepCopy(fields()[0].schema(), other.bool_column); + fieldSetFlags()[0] = true; + } + if (isValidValue(fields()[1], other.int_column)) { + this.int_column = data().deepCopy(fields()[1].schema(), other.int_column); + fieldSetFlags()[1] = true; + } + if (isValidValue(fields()[2], other.long_column)) { + this.long_column = data().deepCopy(fields()[2].schema(), other.long_column); + fieldSetFlags()[2] = true; + } + if (isValidValue(fields()[3], other.float_column)) { + this.float_column = data().deepCopy(fields()[3].schema(), other.float_column); + fieldSetFlags()[3] = true; + } + if (isValidValue(fields()[4], other.double_column)) { + this.double_column = data().deepCopy(fields()[4].schema(), other.double_column); + fieldSetFlags()[4] = true; + } + if (isValidValue(fields()[5], other.binary_column)) { + this.binary_column = data().deepCopy(fields()[5].schema(), other.binary_column); + fieldSetFlags()[5] = true; + } + if (isValidValue(fields()[6], other.string_column)) { + this.string_column = data().deepCopy(fields()[6].schema(), other.string_column); + fieldSetFlags()[6] = true; + } + } + + /** Creates a Builder by copying an existing AvroPrimitives instance */ + private Builder(org.apache.spark.sql.execution.datasources.parquet.test.avro.AvroPrimitives other) { + super(org.apache.spark.sql.execution.datasources.parquet.test.avro.AvroPrimitives.SCHEMA$); + if (isValidValue(fields()[0], other.bool_column)) { + this.bool_column = data().deepCopy(fields()[0].schema(), other.bool_column); + 
fieldSetFlags()[0] = true; + } + if (isValidValue(fields()[1], other.int_column)) { + this.int_column = data().deepCopy(fields()[1].schema(), other.int_column); + fieldSetFlags()[1] = true; + } + if (isValidValue(fields()[2], other.long_column)) { + this.long_column = data().deepCopy(fields()[2].schema(), other.long_column); + fieldSetFlags()[2] = true; + } + if (isValidValue(fields()[3], other.float_column)) { + this.float_column = data().deepCopy(fields()[3].schema(), other.float_column); + fieldSetFlags()[3] = true; + } + if (isValidValue(fields()[4], other.double_column)) { + this.double_column = data().deepCopy(fields()[4].schema(), other.double_column); + fieldSetFlags()[4] = true; + } + if (isValidValue(fields()[5], other.binary_column)) { + this.binary_column = data().deepCopy(fields()[5].schema(), other.binary_column); + fieldSetFlags()[5] = true; + } + if (isValidValue(fields()[6], other.string_column)) { + this.string_column = data().deepCopy(fields()[6].schema(), other.string_column); + fieldSetFlags()[6] = true; + } + } + + /** Gets the value of the 'bool_column' field */ + public java.lang.Boolean getBoolColumn() { + return bool_column; + } + + /** Sets the value of the 'bool_column' field */ + public org.apache.spark.sql.execution.datasources.parquet.test.avro.AvroPrimitives.Builder setBoolColumn(boolean value) { + validate(fields()[0], value); + this.bool_column = value; + fieldSetFlags()[0] = true; + return this; + } + + /** Checks whether the 'bool_column' field has been set */ + public boolean hasBoolColumn() { + return fieldSetFlags()[0]; + } + + /** Clears the value of the 'bool_column' field */ + public org.apache.spark.sql.execution.datasources.parquet.test.avro.AvroPrimitives.Builder clearBoolColumn() { + fieldSetFlags()[0] = false; + return this; + } + + /** Gets the value of the 'int_column' field */ + public java.lang.Integer getIntColumn() { + return int_column; + } + + /** Sets the value of the 'int_column' field */ + public org.apache.spark.sql.execution.datasources.parquet.test.avro.AvroPrimitives.Builder setIntColumn(int value) { + validate(fields()[1], value); + this.int_column = value; + fieldSetFlags()[1] = true; + return this; + } + + /** Checks whether the 'int_column' field has been set */ + public boolean hasIntColumn() { + return fieldSetFlags()[1]; + } + + /** Clears the value of the 'int_column' field */ + public org.apache.spark.sql.execution.datasources.parquet.test.avro.AvroPrimitives.Builder clearIntColumn() { + fieldSetFlags()[1] = false; + return this; + } + + /** Gets the value of the 'long_column' field */ + public java.lang.Long getLongColumn() { + return long_column; + } + + /** Sets the value of the 'long_column' field */ + public org.apache.spark.sql.execution.datasources.parquet.test.avro.AvroPrimitives.Builder setLongColumn(long value) { + validate(fields()[2], value); + this.long_column = value; + fieldSetFlags()[2] = true; + return this; + } + + /** Checks whether the 'long_column' field has been set */ + public boolean hasLongColumn() { + return fieldSetFlags()[2]; + } + + /** Clears the value of the 'long_column' field */ + public org.apache.spark.sql.execution.datasources.parquet.test.avro.AvroPrimitives.Builder clearLongColumn() { + fieldSetFlags()[2] = false; + return this; + } + + /** Gets the value of the 'float_column' field */ + public java.lang.Float getFloatColumn() { + return float_column; + } + + /** Sets the value of the 'float_column' field */ + public 
org.apache.spark.sql.execution.datasources.parquet.test.avro.AvroPrimitives.Builder setFloatColumn(float value) { + validate(fields()[3], value); + this.float_column = value; + fieldSetFlags()[3] = true; + return this; + } + + /** Checks whether the 'float_column' field has been set */ + public boolean hasFloatColumn() { + return fieldSetFlags()[3]; + } + + /** Clears the value of the 'float_column' field */ + public org.apache.spark.sql.execution.datasources.parquet.test.avro.AvroPrimitives.Builder clearFloatColumn() { + fieldSetFlags()[3] = false; + return this; + } + + /** Gets the value of the 'double_column' field */ + public java.lang.Double getDoubleColumn() { + return double_column; + } + + /** Sets the value of the 'double_column' field */ + public org.apache.spark.sql.execution.datasources.parquet.test.avro.AvroPrimitives.Builder setDoubleColumn(double value) { + validate(fields()[4], value); + this.double_column = value; + fieldSetFlags()[4] = true; + return this; + } + + /** Checks whether the 'double_column' field has been set */ + public boolean hasDoubleColumn() { + return fieldSetFlags()[4]; + } + + /** Clears the value of the 'double_column' field */ + public org.apache.spark.sql.execution.datasources.parquet.test.avro.AvroPrimitives.Builder clearDoubleColumn() { + fieldSetFlags()[4] = false; + return this; + } + + /** Gets the value of the 'binary_column' field */ + public java.nio.ByteBuffer getBinaryColumn() { + return binary_column; + } + + /** Sets the value of the 'binary_column' field */ + public org.apache.spark.sql.execution.datasources.parquet.test.avro.AvroPrimitives.Builder setBinaryColumn(java.nio.ByteBuffer value) { + validate(fields()[5], value); + this.binary_column = value; + fieldSetFlags()[5] = true; + return this; + } + + /** Checks whether the 'binary_column' field has been set */ + public boolean hasBinaryColumn() { + return fieldSetFlags()[5]; + } + + /** Clears the value of the 'binary_column' field */ + public org.apache.spark.sql.execution.datasources.parquet.test.avro.AvroPrimitives.Builder clearBinaryColumn() { + binary_column = null; + fieldSetFlags()[5] = false; + return this; + } + + /** Gets the value of the 'string_column' field */ + public java.lang.String getStringColumn() { + return string_column; + } + + /** Sets the value of the 'string_column' field */ + public org.apache.spark.sql.execution.datasources.parquet.test.avro.AvroPrimitives.Builder setStringColumn(java.lang.String value) { + validate(fields()[6], value); + this.string_column = value; + fieldSetFlags()[6] = true; + return this; + } + + /** Checks whether the 'string_column' field has been set */ + public boolean hasStringColumn() { + return fieldSetFlags()[6]; + } + + /** Clears the value of the 'string_column' field */ + public org.apache.spark.sql.execution.datasources.parquet.test.avro.AvroPrimitives.Builder clearStringColumn() { + string_column = null; + fieldSetFlags()[6] = false; + return this; + } + + @Override + public AvroPrimitives build() { + try { + AvroPrimitives record = new AvroPrimitives(); + record.bool_column = fieldSetFlags()[0] ? this.bool_column : (java.lang.Boolean) defaultValue(fields()[0]); + record.int_column = fieldSetFlags()[1] ? this.int_column : (java.lang.Integer) defaultValue(fields()[1]); + record.long_column = fieldSetFlags()[2] ? this.long_column : (java.lang.Long) defaultValue(fields()[2]); + record.float_column = fieldSetFlags()[3] ? 
this.float_column : (java.lang.Float) defaultValue(fields()[3]); + record.double_column = fieldSetFlags()[4] ? this.double_column : (java.lang.Double) defaultValue(fields()[4]); + record.binary_column = fieldSetFlags()[5] ? this.binary_column : (java.nio.ByteBuffer) defaultValue(fields()[5]); + record.string_column = fieldSetFlags()[6] ? this.string_column : (java.lang.String) defaultValue(fields()[6]); + return record; + } catch (Exception e) { + throw new org.apache.avro.AvroRuntimeException(e); + } + } + } +} diff --git a/sql/core/src/test/gen-java/org/apache/spark/sql/execution/datasources/parquet/test/avro/CompatibilityTest.java b/sql/core/src/test/gen-java/org/apache/spark/sql/execution/datasources/parquet/test/avro/CompatibilityTest.java index 2368323cb36b9..28fdc1dfb911c 100644 --- a/sql/core/src/test/gen-java/org/apache/spark/sql/execution/datasources/parquet/test/avro/CompatibilityTest.java +++ b/sql/core/src/test/gen-java/org/apache/spark/sql/execution/datasources/parquet/test/avro/CompatibilityTest.java @@ -8,7 +8,7 @@ @SuppressWarnings("all") @org.apache.avro.specific.AvroGenerated public interface CompatibilityTest { - public static final org.apache.avro.Protocol PROTOCOL = org.apache.avro.Protocol.parse("{\"protocol\":\"CompatibilityTest\",\"namespace\":\"org.apache.spark.sql.execution.datasources.parquet.test.avro\",\"types\":[{\"type\":\"enum\",\"name\":\"Suit\",\"symbols\":[\"SPADES\",\"HEARTS\",\"DIAMONDS\",\"CLUBS\"]},{\"type\":\"record\",\"name\":\"ParquetEnum\",\"fields\":[{\"name\":\"suit\",\"type\":\"Suit\"}]},{\"type\":\"record\",\"name\":\"Nested\",\"fields\":[{\"name\":\"nested_ints_column\",\"type\":{\"type\":\"array\",\"items\":\"int\"}},{\"name\":\"nested_string_column\",\"type\":{\"type\":\"string\",\"avro.java.string\":\"String\"}}]},{\"type\":\"record\",\"name\":\"ParquetAvroCompat\",\"fields\":[{\"name\":\"bool_column\",\"type\":\"boolean\"},{\"name\":\"int_column\",\"type\":\"int\"},{\"name\":\"long_column\",\"type\":\"long\"},{\"name\":\"float_column\",\"type\":\"float\"},{\"name\":\"double_column\",\"type\":\"double\"},{\"name\":\"binary_column\",\"type\":\"bytes\"},{\"name\":\"string_column\",\"type\":{\"type\":\"string\",\"avro.java.string\":\"String\"}},{\"name\":\"maybe_bool_column\",\"type\":[\"null\",\"boolean\"]},{\"name\":\"maybe_int_column\",\"type\":[\"null\",\"int\"]},{\"name\":\"maybe_long_column\",\"type\":[\"null\",\"long\"]},{\"name\":\"maybe_float_column\",\"type\":[\"null\",\"float\"]},{\"name\":\"maybe_double_column\",\"type\":[\"null\",\"double\"]},{\"name\":\"maybe_binary_column\",\"type\":[\"null\",\"bytes\"]},{\"name\":\"maybe_string_column\",\"type\":[\"null\",{\"type\":\"string\",\"avro.java.string\":\"String\"}]},{\"name\":\"strings_column\",\"type\":{\"type\":\"array\",\"items\":{\"type\":\"string\",\"avro.java.string\":\"String\"}}},{\"name\":\"string_to_int_column\",\"type\":{\"type\":\"map\",\"values\":\"int\",\"avro.java.string\":\"String\"}},{\"name\":\"complex_column\",\"type\":{\"type\":\"map\",\"values\":{\"type\":\"array\",\"items\":\"Nested\"},\"avro.java.string\":\"String\"}}]}],\"messages\":{}}"); + public static final org.apache.avro.Protocol PROTOCOL = 
org.apache.avro.Protocol.parse("{\"protocol\":\"CompatibilityTest\",\"namespace\":\"org.apache.spark.sql.execution.datasources.parquet.test.avro\",\"types\":[{\"type\":\"enum\",\"name\":\"Suit\",\"symbols\":[\"SPADES\",\"HEARTS\",\"DIAMONDS\",\"CLUBS\"]},{\"type\":\"record\",\"name\":\"ParquetEnum\",\"fields\":[{\"name\":\"suit\",\"type\":\"Suit\"}]},{\"type\":\"record\",\"name\":\"Nested\",\"fields\":[{\"name\":\"nested_ints_column\",\"type\":{\"type\":\"array\",\"items\":\"int\"}},{\"name\":\"nested_string_column\",\"type\":{\"type\":\"string\",\"avro.java.string\":\"String\"}}]},{\"type\":\"record\",\"name\":\"AvroPrimitives\",\"fields\":[{\"name\":\"bool_column\",\"type\":\"boolean\"},{\"name\":\"int_column\",\"type\":\"int\"},{\"name\":\"long_column\",\"type\":\"long\"},{\"name\":\"float_column\",\"type\":\"float\"},{\"name\":\"double_column\",\"type\":\"double\"},{\"name\":\"binary_column\",\"type\":\"bytes\"},{\"name\":\"string_column\",\"type\":{\"type\":\"string\",\"avro.java.string\":\"String\"}}]},{\"type\":\"record\",\"name\":\"AvroOptionalPrimitives\",\"fields\":[{\"name\":\"maybe_bool_column\",\"type\":[\"null\",\"boolean\"]},{\"name\":\"maybe_int_column\",\"type\":[\"null\",\"int\"]},{\"name\":\"maybe_long_column\",\"type\":[\"null\",\"long\"]},{\"name\":\"maybe_float_column\",\"type\":[\"null\",\"float\"]},{\"name\":\"maybe_double_column\",\"type\":[\"null\",\"double\"]},{\"name\":\"maybe_binary_column\",\"type\":[\"null\",\"bytes\"]},{\"name\":\"maybe_string_column\",\"type\":[\"null\",{\"type\":\"string\",\"avro.java.string\":\"String\"}]}]},{\"type\":\"record\",\"name\":\"AvroNonNullableArrays\",\"fields\":[{\"name\":\"strings_column\",\"type\":{\"type\":\"array\",\"items\":{\"type\":\"string\",\"avro.java.string\":\"String\"}}},{\"name\":\"maybe_ints_column\",\"type\":[\"null\",{\"type\":\"array\",\"items\":\"int\"}]}]},{\"type\":\"record\",\"name\":\"AvroArrayOfArray\",\"fields\":[{\"name\":\"int_arrays_column\",\"type\":{\"type\":\"array\",\"items\":{\"type\":\"array\",\"items\":\"int\"}}}]},{\"type\":\"record\",\"name\":\"AvroMapOfArray\",\"fields\":[{\"name\":\"string_to_ints_column\",\"type\":{\"type\":\"map\",\"values\":{\"type\":\"array\",\"items\":\"int\"},\"avro.java.string\":\"String\"}}]},{\"type\":\"record\",\"name\":\"ParquetAvroCompat\",\"fields\":[{\"name\":\"strings_column\",\"type\":{\"type\":\"array\",\"items\":{\"type\":\"string\",\"avro.java.string\":\"String\"}}},{\"name\":\"string_to_int_column\",\"type\":{\"type\":\"map\",\"values\":\"int\",\"avro.java.string\":\"String\"}},{\"name\":\"complex_column\",\"type\":{\"type\":\"map\",\"values\":{\"type\":\"array\",\"items\":\"Nested\"},\"avro.java.string\":\"String\"}}]}],\"messages\":{}}"); @SuppressWarnings("all") public interface Callback extends CompatibilityTest { diff --git a/sql/core/src/test/gen-java/org/apache/spark/sql/execution/datasources/parquet/test/avro/ParquetAvroCompat.java b/sql/core/src/test/gen-java/org/apache/spark/sql/execution/datasources/parquet/test/avro/ParquetAvroCompat.java index 681cacbd12c7c..ef12d193f916c 100644 --- a/sql/core/src/test/gen-java/org/apache/spark/sql/execution/datasources/parquet/test/avro/ParquetAvroCompat.java +++ b/sql/core/src/test/gen-java/org/apache/spark/sql/execution/datasources/parquet/test/avro/ParquetAvroCompat.java @@ -7,22 +7,8 @@ @SuppressWarnings("all") @org.apache.avro.specific.AvroGenerated public class ParquetAvroCompat extends org.apache.avro.specific.SpecificRecordBase implements org.apache.avro.specific.SpecificRecord { - public static 
final org.apache.avro.Schema SCHEMA$ = new org.apache.avro.Schema.Parser().parse("{\"type\":\"record\",\"name\":\"ParquetAvroCompat\",\"namespace\":\"org.apache.spark.sql.execution.datasources.parquet.test.avro\",\"fields\":[{\"name\":\"bool_column\",\"type\":\"boolean\"},{\"name\":\"int_column\",\"type\":\"int\"},{\"name\":\"long_column\",\"type\":\"long\"},{\"name\":\"float_column\",\"type\":\"float\"},{\"name\":\"double_column\",\"type\":\"double\"},{\"name\":\"binary_column\",\"type\":\"bytes\"},{\"name\":\"string_column\",\"type\":{\"type\":\"string\",\"avro.java.string\":\"String\"}},{\"name\":\"maybe_bool_column\",\"type\":[\"null\",\"boolean\"]},{\"name\":\"maybe_int_column\",\"type\":[\"null\",\"int\"]},{\"name\":\"maybe_long_column\",\"type\":[\"null\",\"long\"]},{\"name\":\"maybe_float_column\",\"type\":[\"null\",\"float\"]},{\"name\":\"maybe_double_column\",\"type\":[\"null\",\"double\"]},{\"name\":\"maybe_binary_column\",\"type\":[\"null\",\"bytes\"]},{\"name\":\"maybe_string_column\",\"type\":[\"null\",{\"type\":\"string\",\"avro.java.string\":\"String\"}]},{\"name\":\"strings_column\",\"type\":{\"type\":\"array\",\"items\":{\"type\":\"string\",\"avro.java.string\":\"String\"}}},{\"name\":\"string_to_int_column\",\"type\":{\"type\":\"map\",\"values\":\"int\",\"avro.java.string\":\"String\"}},{\"name\":\"complex_column\",\"type\":{\"type\":\"map\",\"values\":{\"type\":\"array\",\"items\":{\"type\":\"record\",\"name\":\"Nested\",\"fields\":[{\"name\":\"nested_ints_column\",\"type\":{\"type\":\"array\",\"items\":\"int\"}},{\"name\":\"nested_string_column\",\"type\":{\"type\":\"string\",\"avro.java.string\":\"String\"}}]}},\"avro.java.string\":\"String\"}}]}"); + public static final org.apache.avro.Schema SCHEMA$ = new org.apache.avro.Schema.Parser().parse("{\"type\":\"record\",\"name\":\"ParquetAvroCompat\",\"namespace\":\"org.apache.spark.sql.execution.datasources.parquet.test.avro\",\"fields\":[{\"name\":\"strings_column\",\"type\":{\"type\":\"array\",\"items\":{\"type\":\"string\",\"avro.java.string\":\"String\"}}},{\"name\":\"string_to_int_column\",\"type\":{\"type\":\"map\",\"values\":\"int\",\"avro.java.string\":\"String\"}},{\"name\":\"complex_column\",\"type\":{\"type\":\"map\",\"values\":{\"type\":\"array\",\"items\":{\"type\":\"record\",\"name\":\"Nested\",\"fields\":[{\"name\":\"nested_ints_column\",\"type\":{\"type\":\"array\",\"items\":\"int\"}},{\"name\":\"nested_string_column\",\"type\":{\"type\":\"string\",\"avro.java.string\":\"String\"}}]}},\"avro.java.string\":\"String\"}}]}"); public static org.apache.avro.Schema getClassSchema() { return SCHEMA$; } - @Deprecated public boolean bool_column; - @Deprecated public int int_column; - @Deprecated public long long_column; - @Deprecated public float float_column; - @Deprecated public double double_column; - @Deprecated public java.nio.ByteBuffer binary_column; - @Deprecated public java.lang.String string_column; - @Deprecated public java.lang.Boolean maybe_bool_column; - @Deprecated public java.lang.Integer maybe_int_column; - @Deprecated public java.lang.Long maybe_long_column; - @Deprecated public java.lang.Float maybe_float_column; - @Deprecated public java.lang.Double maybe_double_column; - @Deprecated public java.nio.ByteBuffer maybe_binary_column; - @Deprecated public java.lang.String maybe_string_column; @Deprecated public java.util.List strings_column; @Deprecated public java.util.Map string_to_int_column; @Deprecated public java.util.Map> complex_column; @@ -37,21 +23,7 @@ public ParquetAvroCompat() {} /** * 
All-args constructor. */ - public ParquetAvroCompat(java.lang.Boolean bool_column, java.lang.Integer int_column, java.lang.Long long_column, java.lang.Float float_column, java.lang.Double double_column, java.nio.ByteBuffer binary_column, java.lang.String string_column, java.lang.Boolean maybe_bool_column, java.lang.Integer maybe_int_column, java.lang.Long maybe_long_column, java.lang.Float maybe_float_column, java.lang.Double maybe_double_column, java.nio.ByteBuffer maybe_binary_column, java.lang.String maybe_string_column, java.util.List strings_column, java.util.Map string_to_int_column, java.util.Map> complex_column) { - this.bool_column = bool_column; - this.int_column = int_column; - this.long_column = long_column; - this.float_column = float_column; - this.double_column = double_column; - this.binary_column = binary_column; - this.string_column = string_column; - this.maybe_bool_column = maybe_bool_column; - this.maybe_int_column = maybe_int_column; - this.maybe_long_column = maybe_long_column; - this.maybe_float_column = maybe_float_column; - this.maybe_double_column = maybe_double_column; - this.maybe_binary_column = maybe_binary_column; - this.maybe_string_column = maybe_string_column; + public ParquetAvroCompat(java.util.List strings_column, java.util.Map string_to_int_column, java.util.Map> complex_column) { this.strings_column = strings_column; this.string_to_int_column = string_to_int_column; this.complex_column = complex_column; @@ -61,23 +33,9 @@ public ParquetAvroCompat(java.lang.Boolean bool_column, java.lang.Integer int_co // Used by DatumWriter. Applications should not call. public java.lang.Object get(int field$) { switch (field$) { - case 0: return bool_column; - case 1: return int_column; - case 2: return long_column; - case 3: return float_column; - case 4: return double_column; - case 5: return binary_column; - case 6: return string_column; - case 7: return maybe_bool_column; - case 8: return maybe_int_column; - case 9: return maybe_long_column; - case 10: return maybe_float_column; - case 11: return maybe_double_column; - case 12: return maybe_binary_column; - case 13: return maybe_string_column; - case 14: return strings_column; - case 15: return string_to_int_column; - case 16: return complex_column; + case 0: return strings_column; + case 1: return string_to_int_column; + case 2: return complex_column; default: throw new org.apache.avro.AvroRuntimeException("Bad index"); } } @@ -85,237 +43,13 @@ public java.lang.Object get(int field$) { @SuppressWarnings(value="unchecked") public void put(int field$, java.lang.Object value$) { switch (field$) { - case 0: bool_column = (java.lang.Boolean)value$; break; - case 1: int_column = (java.lang.Integer)value$; break; - case 2: long_column = (java.lang.Long)value$; break; - case 3: float_column = (java.lang.Float)value$; break; - case 4: double_column = (java.lang.Double)value$; break; - case 5: binary_column = (java.nio.ByteBuffer)value$; break; - case 6: string_column = (java.lang.String)value$; break; - case 7: maybe_bool_column = (java.lang.Boolean)value$; break; - case 8: maybe_int_column = (java.lang.Integer)value$; break; - case 9: maybe_long_column = (java.lang.Long)value$; break; - case 10: maybe_float_column = (java.lang.Float)value$; break; - case 11: maybe_double_column = (java.lang.Double)value$; break; - case 12: maybe_binary_column = (java.nio.ByteBuffer)value$; break; - case 13: maybe_string_column = (java.lang.String)value$; break; - case 14: strings_column = (java.util.List)value$; break; - case 15: 
string_to_int_column = (java.util.Map)value$; break; - case 16: complex_column = (java.util.Map>)value$; break; + case 0: strings_column = (java.util.List)value$; break; + case 1: string_to_int_column = (java.util.Map)value$; break; + case 2: complex_column = (java.util.Map>)value$; break; default: throw new org.apache.avro.AvroRuntimeException("Bad index"); } } - /** - * Gets the value of the 'bool_column' field. - */ - public java.lang.Boolean getBoolColumn() { - return bool_column; - } - - /** - * Sets the value of the 'bool_column' field. - * @param value the value to set. - */ - public void setBoolColumn(java.lang.Boolean value) { - this.bool_column = value; - } - - /** - * Gets the value of the 'int_column' field. - */ - public java.lang.Integer getIntColumn() { - return int_column; - } - - /** - * Sets the value of the 'int_column' field. - * @param value the value to set. - */ - public void setIntColumn(java.lang.Integer value) { - this.int_column = value; - } - - /** - * Gets the value of the 'long_column' field. - */ - public java.lang.Long getLongColumn() { - return long_column; - } - - /** - * Sets the value of the 'long_column' field. - * @param value the value to set. - */ - public void setLongColumn(java.lang.Long value) { - this.long_column = value; - } - - /** - * Gets the value of the 'float_column' field. - */ - public java.lang.Float getFloatColumn() { - return float_column; - } - - /** - * Sets the value of the 'float_column' field. - * @param value the value to set. - */ - public void setFloatColumn(java.lang.Float value) { - this.float_column = value; - } - - /** - * Gets the value of the 'double_column' field. - */ - public java.lang.Double getDoubleColumn() { - return double_column; - } - - /** - * Sets the value of the 'double_column' field. - * @param value the value to set. - */ - public void setDoubleColumn(java.lang.Double value) { - this.double_column = value; - } - - /** - * Gets the value of the 'binary_column' field. - */ - public java.nio.ByteBuffer getBinaryColumn() { - return binary_column; - } - - /** - * Sets the value of the 'binary_column' field. - * @param value the value to set. - */ - public void setBinaryColumn(java.nio.ByteBuffer value) { - this.binary_column = value; - } - - /** - * Gets the value of the 'string_column' field. - */ - public java.lang.String getStringColumn() { - return string_column; - } - - /** - * Sets the value of the 'string_column' field. - * @param value the value to set. - */ - public void setStringColumn(java.lang.String value) { - this.string_column = value; - } - - /** - * Gets the value of the 'maybe_bool_column' field. - */ - public java.lang.Boolean getMaybeBoolColumn() { - return maybe_bool_column; - } - - /** - * Sets the value of the 'maybe_bool_column' field. - * @param value the value to set. - */ - public void setMaybeBoolColumn(java.lang.Boolean value) { - this.maybe_bool_column = value; - } - - /** - * Gets the value of the 'maybe_int_column' field. - */ - public java.lang.Integer getMaybeIntColumn() { - return maybe_int_column; - } - - /** - * Sets the value of the 'maybe_int_column' field. - * @param value the value to set. - */ - public void setMaybeIntColumn(java.lang.Integer value) { - this.maybe_int_column = value; - } - - /** - * Gets the value of the 'maybe_long_column' field. - */ - public java.lang.Long getMaybeLongColumn() { - return maybe_long_column; - } - - /** - * Sets the value of the 'maybe_long_column' field. - * @param value the value to set. 
- */ - public void setMaybeLongColumn(java.lang.Long value) { - this.maybe_long_column = value; - } - - /** - * Gets the value of the 'maybe_float_column' field. - */ - public java.lang.Float getMaybeFloatColumn() { - return maybe_float_column; - } - - /** - * Sets the value of the 'maybe_float_column' field. - * @param value the value to set. - */ - public void setMaybeFloatColumn(java.lang.Float value) { - this.maybe_float_column = value; - } - - /** - * Gets the value of the 'maybe_double_column' field. - */ - public java.lang.Double getMaybeDoubleColumn() { - return maybe_double_column; - } - - /** - * Sets the value of the 'maybe_double_column' field. - * @param value the value to set. - */ - public void setMaybeDoubleColumn(java.lang.Double value) { - this.maybe_double_column = value; - } - - /** - * Gets the value of the 'maybe_binary_column' field. - */ - public java.nio.ByteBuffer getMaybeBinaryColumn() { - return maybe_binary_column; - } - - /** - * Sets the value of the 'maybe_binary_column' field. - * @param value the value to set. - */ - public void setMaybeBinaryColumn(java.nio.ByteBuffer value) { - this.maybe_binary_column = value; - } - - /** - * Gets the value of the 'maybe_string_column' field. - */ - public java.lang.String getMaybeStringColumn() { - return maybe_string_column; - } - - /** - * Sets the value of the 'maybe_string_column' field. - * @param value the value to set. - */ - public void setMaybeStringColumn(java.lang.String value) { - this.maybe_string_column = value; - } - /** * Gets the value of the 'strings_column' field. */ @@ -382,20 +116,6 @@ public static org.apache.spark.sql.execution.datasources.parquet.test.avro.Parqu public static class Builder extends org.apache.avro.specific.SpecificRecordBuilderBase implements org.apache.avro.data.RecordBuilder { - private boolean bool_column; - private int int_column; - private long long_column; - private float float_column; - private double double_column; - private java.nio.ByteBuffer binary_column; - private java.lang.String string_column; - private java.lang.Boolean maybe_bool_column; - private java.lang.Integer maybe_int_column; - private java.lang.Long maybe_long_column; - private java.lang.Float maybe_float_column; - private java.lang.Double maybe_double_column; - private java.nio.ByteBuffer maybe_binary_column; - private java.lang.String maybe_string_column; private java.util.List strings_column; private java.util.Map string_to_int_column; private java.util.Map> complex_column; @@ -408,492 +128,35 @@ private Builder() { /** Creates a Builder by copying an existing Builder */ private Builder(org.apache.spark.sql.execution.datasources.parquet.test.avro.ParquetAvroCompat.Builder other) { super(other); - if (isValidValue(fields()[0], other.bool_column)) { - this.bool_column = data().deepCopy(fields()[0].schema(), other.bool_column); + if (isValidValue(fields()[0], other.strings_column)) { + this.strings_column = data().deepCopy(fields()[0].schema(), other.strings_column); fieldSetFlags()[0] = true; } - if (isValidValue(fields()[1], other.int_column)) { - this.int_column = data().deepCopy(fields()[1].schema(), other.int_column); + if (isValidValue(fields()[1], other.string_to_int_column)) { + this.string_to_int_column = data().deepCopy(fields()[1].schema(), other.string_to_int_column); fieldSetFlags()[1] = true; } - if (isValidValue(fields()[2], other.long_column)) { - this.long_column = data().deepCopy(fields()[2].schema(), other.long_column); + if (isValidValue(fields()[2], other.complex_column)) { + 
this.complex_column = data().deepCopy(fields()[2].schema(), other.complex_column); fieldSetFlags()[2] = true; } - if (isValidValue(fields()[3], other.float_column)) { - this.float_column = data().deepCopy(fields()[3].schema(), other.float_column); - fieldSetFlags()[3] = true; - } - if (isValidValue(fields()[4], other.double_column)) { - this.double_column = data().deepCopy(fields()[4].schema(), other.double_column); - fieldSetFlags()[4] = true; - } - if (isValidValue(fields()[5], other.binary_column)) { - this.binary_column = data().deepCopy(fields()[5].schema(), other.binary_column); - fieldSetFlags()[5] = true; - } - if (isValidValue(fields()[6], other.string_column)) { - this.string_column = data().deepCopy(fields()[6].schema(), other.string_column); - fieldSetFlags()[6] = true; - } - if (isValidValue(fields()[7], other.maybe_bool_column)) { - this.maybe_bool_column = data().deepCopy(fields()[7].schema(), other.maybe_bool_column); - fieldSetFlags()[7] = true; - } - if (isValidValue(fields()[8], other.maybe_int_column)) { - this.maybe_int_column = data().deepCopy(fields()[8].schema(), other.maybe_int_column); - fieldSetFlags()[8] = true; - } - if (isValidValue(fields()[9], other.maybe_long_column)) { - this.maybe_long_column = data().deepCopy(fields()[9].schema(), other.maybe_long_column); - fieldSetFlags()[9] = true; - } - if (isValidValue(fields()[10], other.maybe_float_column)) { - this.maybe_float_column = data().deepCopy(fields()[10].schema(), other.maybe_float_column); - fieldSetFlags()[10] = true; - } - if (isValidValue(fields()[11], other.maybe_double_column)) { - this.maybe_double_column = data().deepCopy(fields()[11].schema(), other.maybe_double_column); - fieldSetFlags()[11] = true; - } - if (isValidValue(fields()[12], other.maybe_binary_column)) { - this.maybe_binary_column = data().deepCopy(fields()[12].schema(), other.maybe_binary_column); - fieldSetFlags()[12] = true; - } - if (isValidValue(fields()[13], other.maybe_string_column)) { - this.maybe_string_column = data().deepCopy(fields()[13].schema(), other.maybe_string_column); - fieldSetFlags()[13] = true; - } - if (isValidValue(fields()[14], other.strings_column)) { - this.strings_column = data().deepCopy(fields()[14].schema(), other.strings_column); - fieldSetFlags()[14] = true; - } - if (isValidValue(fields()[15], other.string_to_int_column)) { - this.string_to_int_column = data().deepCopy(fields()[15].schema(), other.string_to_int_column); - fieldSetFlags()[15] = true; - } - if (isValidValue(fields()[16], other.complex_column)) { - this.complex_column = data().deepCopy(fields()[16].schema(), other.complex_column); - fieldSetFlags()[16] = true; - } } /** Creates a Builder by copying an existing ParquetAvroCompat instance */ private Builder(org.apache.spark.sql.execution.datasources.parquet.test.avro.ParquetAvroCompat other) { super(org.apache.spark.sql.execution.datasources.parquet.test.avro.ParquetAvroCompat.SCHEMA$); - if (isValidValue(fields()[0], other.bool_column)) { - this.bool_column = data().deepCopy(fields()[0].schema(), other.bool_column); + if (isValidValue(fields()[0], other.strings_column)) { + this.strings_column = data().deepCopy(fields()[0].schema(), other.strings_column); fieldSetFlags()[0] = true; } - if (isValidValue(fields()[1], other.int_column)) { - this.int_column = data().deepCopy(fields()[1].schema(), other.int_column); + if (isValidValue(fields()[1], other.string_to_int_column)) { + this.string_to_int_column = data().deepCopy(fields()[1].schema(), other.string_to_int_column); 
fieldSetFlags()[1] = true; } - if (isValidValue(fields()[2], other.long_column)) { - this.long_column = data().deepCopy(fields()[2].schema(), other.long_column); + if (isValidValue(fields()[2], other.complex_column)) { + this.complex_column = data().deepCopy(fields()[2].schema(), other.complex_column); fieldSetFlags()[2] = true; } - if (isValidValue(fields()[3], other.float_column)) { - this.float_column = data().deepCopy(fields()[3].schema(), other.float_column); - fieldSetFlags()[3] = true; - } - if (isValidValue(fields()[4], other.double_column)) { - this.double_column = data().deepCopy(fields()[4].schema(), other.double_column); - fieldSetFlags()[4] = true; - } - if (isValidValue(fields()[5], other.binary_column)) { - this.binary_column = data().deepCopy(fields()[5].schema(), other.binary_column); - fieldSetFlags()[5] = true; - } - if (isValidValue(fields()[6], other.string_column)) { - this.string_column = data().deepCopy(fields()[6].schema(), other.string_column); - fieldSetFlags()[6] = true; - } - if (isValidValue(fields()[7], other.maybe_bool_column)) { - this.maybe_bool_column = data().deepCopy(fields()[7].schema(), other.maybe_bool_column); - fieldSetFlags()[7] = true; - } - if (isValidValue(fields()[8], other.maybe_int_column)) { - this.maybe_int_column = data().deepCopy(fields()[8].schema(), other.maybe_int_column); - fieldSetFlags()[8] = true; - } - if (isValidValue(fields()[9], other.maybe_long_column)) { - this.maybe_long_column = data().deepCopy(fields()[9].schema(), other.maybe_long_column); - fieldSetFlags()[9] = true; - } - if (isValidValue(fields()[10], other.maybe_float_column)) { - this.maybe_float_column = data().deepCopy(fields()[10].schema(), other.maybe_float_column); - fieldSetFlags()[10] = true; - } - if (isValidValue(fields()[11], other.maybe_double_column)) { - this.maybe_double_column = data().deepCopy(fields()[11].schema(), other.maybe_double_column); - fieldSetFlags()[11] = true; - } - if (isValidValue(fields()[12], other.maybe_binary_column)) { - this.maybe_binary_column = data().deepCopy(fields()[12].schema(), other.maybe_binary_column); - fieldSetFlags()[12] = true; - } - if (isValidValue(fields()[13], other.maybe_string_column)) { - this.maybe_string_column = data().deepCopy(fields()[13].schema(), other.maybe_string_column); - fieldSetFlags()[13] = true; - } - if (isValidValue(fields()[14], other.strings_column)) { - this.strings_column = data().deepCopy(fields()[14].schema(), other.strings_column); - fieldSetFlags()[14] = true; - } - if (isValidValue(fields()[15], other.string_to_int_column)) { - this.string_to_int_column = data().deepCopy(fields()[15].schema(), other.string_to_int_column); - fieldSetFlags()[15] = true; - } - if (isValidValue(fields()[16], other.complex_column)) { - this.complex_column = data().deepCopy(fields()[16].schema(), other.complex_column); - fieldSetFlags()[16] = true; - } - } - - /** Gets the value of the 'bool_column' field */ - public java.lang.Boolean getBoolColumn() { - return bool_column; - } - - /** Sets the value of the 'bool_column' field */ - public org.apache.spark.sql.execution.datasources.parquet.test.avro.ParquetAvroCompat.Builder setBoolColumn(boolean value) { - validate(fields()[0], value); - this.bool_column = value; - fieldSetFlags()[0] = true; - return this; - } - - /** Checks whether the 'bool_column' field has been set */ - public boolean hasBoolColumn() { - return fieldSetFlags()[0]; - } - - /** Clears the value of the 'bool_column' field */ - public 
org.apache.spark.sql.execution.datasources.parquet.test.avro.ParquetAvroCompat.Builder clearBoolColumn() { - fieldSetFlags()[0] = false; - return this; - } - - /** Gets the value of the 'int_column' field */ - public java.lang.Integer getIntColumn() { - return int_column; - } - - /** Sets the value of the 'int_column' field */ - public org.apache.spark.sql.execution.datasources.parquet.test.avro.ParquetAvroCompat.Builder setIntColumn(int value) { - validate(fields()[1], value); - this.int_column = value; - fieldSetFlags()[1] = true; - return this; - } - - /** Checks whether the 'int_column' field has been set */ - public boolean hasIntColumn() { - return fieldSetFlags()[1]; - } - - /** Clears the value of the 'int_column' field */ - public org.apache.spark.sql.execution.datasources.parquet.test.avro.ParquetAvroCompat.Builder clearIntColumn() { - fieldSetFlags()[1] = false; - return this; - } - - /** Gets the value of the 'long_column' field */ - public java.lang.Long getLongColumn() { - return long_column; - } - - /** Sets the value of the 'long_column' field */ - public org.apache.spark.sql.execution.datasources.parquet.test.avro.ParquetAvroCompat.Builder setLongColumn(long value) { - validate(fields()[2], value); - this.long_column = value; - fieldSetFlags()[2] = true; - return this; - } - - /** Checks whether the 'long_column' field has been set */ - public boolean hasLongColumn() { - return fieldSetFlags()[2]; - } - - /** Clears the value of the 'long_column' field */ - public org.apache.spark.sql.execution.datasources.parquet.test.avro.ParquetAvroCompat.Builder clearLongColumn() { - fieldSetFlags()[2] = false; - return this; - } - - /** Gets the value of the 'float_column' field */ - public java.lang.Float getFloatColumn() { - return float_column; - } - - /** Sets the value of the 'float_column' field */ - public org.apache.spark.sql.execution.datasources.parquet.test.avro.ParquetAvroCompat.Builder setFloatColumn(float value) { - validate(fields()[3], value); - this.float_column = value; - fieldSetFlags()[3] = true; - return this; - } - - /** Checks whether the 'float_column' field has been set */ - public boolean hasFloatColumn() { - return fieldSetFlags()[3]; - } - - /** Clears the value of the 'float_column' field */ - public org.apache.spark.sql.execution.datasources.parquet.test.avro.ParquetAvroCompat.Builder clearFloatColumn() { - fieldSetFlags()[3] = false; - return this; - } - - /** Gets the value of the 'double_column' field */ - public java.lang.Double getDoubleColumn() { - return double_column; - } - - /** Sets the value of the 'double_column' field */ - public org.apache.spark.sql.execution.datasources.parquet.test.avro.ParquetAvroCompat.Builder setDoubleColumn(double value) { - validate(fields()[4], value); - this.double_column = value; - fieldSetFlags()[4] = true; - return this; - } - - /** Checks whether the 'double_column' field has been set */ - public boolean hasDoubleColumn() { - return fieldSetFlags()[4]; - } - - /** Clears the value of the 'double_column' field */ - public org.apache.spark.sql.execution.datasources.parquet.test.avro.ParquetAvroCompat.Builder clearDoubleColumn() { - fieldSetFlags()[4] = false; - return this; - } - - /** Gets the value of the 'binary_column' field */ - public java.nio.ByteBuffer getBinaryColumn() { - return binary_column; - } - - /** Sets the value of the 'binary_column' field */ - public org.apache.spark.sql.execution.datasources.parquet.test.avro.ParquetAvroCompat.Builder setBinaryColumn(java.nio.ByteBuffer value) { - 
validate(fields()[5], value); - this.binary_column = value; - fieldSetFlags()[5] = true; - return this; - } - - /** Checks whether the 'binary_column' field has been set */ - public boolean hasBinaryColumn() { - return fieldSetFlags()[5]; - } - - /** Clears the value of the 'binary_column' field */ - public org.apache.spark.sql.execution.datasources.parquet.test.avro.ParquetAvroCompat.Builder clearBinaryColumn() { - binary_column = null; - fieldSetFlags()[5] = false; - return this; - } - - /** Gets the value of the 'string_column' field */ - public java.lang.String getStringColumn() { - return string_column; - } - - /** Sets the value of the 'string_column' field */ - public org.apache.spark.sql.execution.datasources.parquet.test.avro.ParquetAvroCompat.Builder setStringColumn(java.lang.String value) { - validate(fields()[6], value); - this.string_column = value; - fieldSetFlags()[6] = true; - return this; - } - - /** Checks whether the 'string_column' field has been set */ - public boolean hasStringColumn() { - return fieldSetFlags()[6]; - } - - /** Clears the value of the 'string_column' field */ - public org.apache.spark.sql.execution.datasources.parquet.test.avro.ParquetAvroCompat.Builder clearStringColumn() { - string_column = null; - fieldSetFlags()[6] = false; - return this; - } - - /** Gets the value of the 'maybe_bool_column' field */ - public java.lang.Boolean getMaybeBoolColumn() { - return maybe_bool_column; - } - - /** Sets the value of the 'maybe_bool_column' field */ - public org.apache.spark.sql.execution.datasources.parquet.test.avro.ParquetAvroCompat.Builder setMaybeBoolColumn(java.lang.Boolean value) { - validate(fields()[7], value); - this.maybe_bool_column = value; - fieldSetFlags()[7] = true; - return this; - } - - /** Checks whether the 'maybe_bool_column' field has been set */ - public boolean hasMaybeBoolColumn() { - return fieldSetFlags()[7]; - } - - /** Clears the value of the 'maybe_bool_column' field */ - public org.apache.spark.sql.execution.datasources.parquet.test.avro.ParquetAvroCompat.Builder clearMaybeBoolColumn() { - maybe_bool_column = null; - fieldSetFlags()[7] = false; - return this; - } - - /** Gets the value of the 'maybe_int_column' field */ - public java.lang.Integer getMaybeIntColumn() { - return maybe_int_column; - } - - /** Sets the value of the 'maybe_int_column' field */ - public org.apache.spark.sql.execution.datasources.parquet.test.avro.ParquetAvroCompat.Builder setMaybeIntColumn(java.lang.Integer value) { - validate(fields()[8], value); - this.maybe_int_column = value; - fieldSetFlags()[8] = true; - return this; - } - - /** Checks whether the 'maybe_int_column' field has been set */ - public boolean hasMaybeIntColumn() { - return fieldSetFlags()[8]; - } - - /** Clears the value of the 'maybe_int_column' field */ - public org.apache.spark.sql.execution.datasources.parquet.test.avro.ParquetAvroCompat.Builder clearMaybeIntColumn() { - maybe_int_column = null; - fieldSetFlags()[8] = false; - return this; - } - - /** Gets the value of the 'maybe_long_column' field */ - public java.lang.Long getMaybeLongColumn() { - return maybe_long_column; - } - - /** Sets the value of the 'maybe_long_column' field */ - public org.apache.spark.sql.execution.datasources.parquet.test.avro.ParquetAvroCompat.Builder setMaybeLongColumn(java.lang.Long value) { - validate(fields()[9], value); - this.maybe_long_column = value; - fieldSetFlags()[9] = true; - return this; - } - - /** Checks whether the 'maybe_long_column' field has been set */ - public boolean 
hasMaybeLongColumn() { - return fieldSetFlags()[9]; - } - - /** Clears the value of the 'maybe_long_column' field */ - public org.apache.spark.sql.execution.datasources.parquet.test.avro.ParquetAvroCompat.Builder clearMaybeLongColumn() { - maybe_long_column = null; - fieldSetFlags()[9] = false; - return this; - } - - /** Gets the value of the 'maybe_float_column' field */ - public java.lang.Float getMaybeFloatColumn() { - return maybe_float_column; - } - - /** Sets the value of the 'maybe_float_column' field */ - public org.apache.spark.sql.execution.datasources.parquet.test.avro.ParquetAvroCompat.Builder setMaybeFloatColumn(java.lang.Float value) { - validate(fields()[10], value); - this.maybe_float_column = value; - fieldSetFlags()[10] = true; - return this; - } - - /** Checks whether the 'maybe_float_column' field has been set */ - public boolean hasMaybeFloatColumn() { - return fieldSetFlags()[10]; - } - - /** Clears the value of the 'maybe_float_column' field */ - public org.apache.spark.sql.execution.datasources.parquet.test.avro.ParquetAvroCompat.Builder clearMaybeFloatColumn() { - maybe_float_column = null; - fieldSetFlags()[10] = false; - return this; - } - - /** Gets the value of the 'maybe_double_column' field */ - public java.lang.Double getMaybeDoubleColumn() { - return maybe_double_column; - } - - /** Sets the value of the 'maybe_double_column' field */ - public org.apache.spark.sql.execution.datasources.parquet.test.avro.ParquetAvroCompat.Builder setMaybeDoubleColumn(java.lang.Double value) { - validate(fields()[11], value); - this.maybe_double_column = value; - fieldSetFlags()[11] = true; - return this; - } - - /** Checks whether the 'maybe_double_column' field has been set */ - public boolean hasMaybeDoubleColumn() { - return fieldSetFlags()[11]; - } - - /** Clears the value of the 'maybe_double_column' field */ - public org.apache.spark.sql.execution.datasources.parquet.test.avro.ParquetAvroCompat.Builder clearMaybeDoubleColumn() { - maybe_double_column = null; - fieldSetFlags()[11] = false; - return this; - } - - /** Gets the value of the 'maybe_binary_column' field */ - public java.nio.ByteBuffer getMaybeBinaryColumn() { - return maybe_binary_column; - } - - /** Sets the value of the 'maybe_binary_column' field */ - public org.apache.spark.sql.execution.datasources.parquet.test.avro.ParquetAvroCompat.Builder setMaybeBinaryColumn(java.nio.ByteBuffer value) { - validate(fields()[12], value); - this.maybe_binary_column = value; - fieldSetFlags()[12] = true; - return this; - } - - /** Checks whether the 'maybe_binary_column' field has been set */ - public boolean hasMaybeBinaryColumn() { - return fieldSetFlags()[12]; - } - - /** Clears the value of the 'maybe_binary_column' field */ - public org.apache.spark.sql.execution.datasources.parquet.test.avro.ParquetAvroCompat.Builder clearMaybeBinaryColumn() { - maybe_binary_column = null; - fieldSetFlags()[12] = false; - return this; - } - - /** Gets the value of the 'maybe_string_column' field */ - public java.lang.String getMaybeStringColumn() { - return maybe_string_column; - } - - /** Sets the value of the 'maybe_string_column' field */ - public org.apache.spark.sql.execution.datasources.parquet.test.avro.ParquetAvroCompat.Builder setMaybeStringColumn(java.lang.String value) { - validate(fields()[13], value); - this.maybe_string_column = value; - fieldSetFlags()[13] = true; - return this; - } - - /** Checks whether the 'maybe_string_column' field has been set */ - public boolean hasMaybeStringColumn() { - return 
fieldSetFlags()[13]; - } - - /** Clears the value of the 'maybe_string_column' field */ - public org.apache.spark.sql.execution.datasources.parquet.test.avro.ParquetAvroCompat.Builder clearMaybeStringColumn() { - maybe_string_column = null; - fieldSetFlags()[13] = false; - return this; } /** Gets the value of the 'strings_column' field */ @@ -903,21 +166,21 @@ public java.util.List getStringsColumn() { /** Sets the value of the 'strings_column' field */ public org.apache.spark.sql.execution.datasources.parquet.test.avro.ParquetAvroCompat.Builder setStringsColumn(java.util.List value) { - validate(fields()[14], value); + validate(fields()[0], value); this.strings_column = value; - fieldSetFlags()[14] = true; + fieldSetFlags()[0] = true; return this; } /** Checks whether the 'strings_column' field has been set */ public boolean hasStringsColumn() { - return fieldSetFlags()[14]; + return fieldSetFlags()[0]; } /** Clears the value of the 'strings_column' field */ public org.apache.spark.sql.execution.datasources.parquet.test.avro.ParquetAvroCompat.Builder clearStringsColumn() { strings_column = null; - fieldSetFlags()[14] = false; + fieldSetFlags()[0] = false; return this; } @@ -928,21 +191,21 @@ public java.util.Map getStringToIntColumn() /** Sets the value of the 'string_to_int_column' field */ public org.apache.spark.sql.execution.datasources.parquet.test.avro.ParquetAvroCompat.Builder setStringToIntColumn(java.util.Map value) { - validate(fields()[15], value); + validate(fields()[1], value); this.string_to_int_column = value; - fieldSetFlags()[15] = true; + fieldSetFlags()[1] = true; return this; } /** Checks whether the 'string_to_int_column' field has been set */ public boolean hasStringToIntColumn() { - return fieldSetFlags()[15]; + return fieldSetFlags()[1]; } /** Clears the value of the 'string_to_int_column' field */ public org.apache.spark.sql.execution.datasources.parquet.test.avro.ParquetAvroCompat.Builder clearStringToIntColumn() { string_to_int_column = null; - fieldSetFlags()[15] = false; + fieldSetFlags()[1] = false; return this; } @@ -953,21 +216,21 @@ public java.util.Map> value) { - validate(fields()[16], value); + validate(fields()[2], value); this.complex_column = value; - fieldSetFlags()[16] = true; + fieldSetFlags()[2] = true; return this; } /** Checks whether the 'complex_column' field has been set */ public boolean hasComplexColumn() { - return fieldSetFlags()[16]; + return fieldSetFlags()[2]; } /** Clears the value of the 'complex_column' field */ public org.apache.spark.sql.execution.datasources.parquet.test.avro.ParquetAvroCompat.Builder clearComplexColumn() { complex_column = null; - fieldSetFlags()[16] = false; + fieldSetFlags()[2] = false; return this; } @@ -975,23 +238,9 @@ public org.apache.spark.sql.execution.datasources.parquet.test.avro.ParquetAvroC public ParquetAvroCompat build() { try { ParquetAvroCompat record = new ParquetAvroCompat(); - record.bool_column = fieldSetFlags()[0] ? this.bool_column : (java.lang.Boolean) defaultValue(fields()[0]); - record.int_column = fieldSetFlags()[1] ? this.int_column : (java.lang.Integer) defaultValue(fields()[1]); - record.long_column = fieldSetFlags()[2] ? this.long_column : (java.lang.Long) defaultValue(fields()[2]); - record.float_column = fieldSetFlags()[3] ? this.float_column : (java.lang.Float) defaultValue(fields()[3]); - record.double_column = fieldSetFlags()[4] ? this.double_column : (java.lang.Double) defaultValue(fields()[4]); - record.binary_column = fieldSetFlags()[5] ? 
this.binary_column : (java.nio.ByteBuffer) defaultValue(fields()[5]); - record.string_column = fieldSetFlags()[6] ? this.string_column : (java.lang.String) defaultValue(fields()[6]); - record.maybe_bool_column = fieldSetFlags()[7] ? this.maybe_bool_column : (java.lang.Boolean) defaultValue(fields()[7]); - record.maybe_int_column = fieldSetFlags()[8] ? this.maybe_int_column : (java.lang.Integer) defaultValue(fields()[8]); - record.maybe_long_column = fieldSetFlags()[9] ? this.maybe_long_column : (java.lang.Long) defaultValue(fields()[9]); - record.maybe_float_column = fieldSetFlags()[10] ? this.maybe_float_column : (java.lang.Float) defaultValue(fields()[10]); - record.maybe_double_column = fieldSetFlags()[11] ? this.maybe_double_column : (java.lang.Double) defaultValue(fields()[11]); - record.maybe_binary_column = fieldSetFlags()[12] ? this.maybe_binary_column : (java.nio.ByteBuffer) defaultValue(fields()[12]); - record.maybe_string_column = fieldSetFlags()[13] ? this.maybe_string_column : (java.lang.String) defaultValue(fields()[13]); - record.strings_column = fieldSetFlags()[14] ? this.strings_column : (java.util.List) defaultValue(fields()[14]); - record.string_to_int_column = fieldSetFlags()[15] ? this.string_to_int_column : (java.util.Map) defaultValue(fields()[15]); - record.complex_column = fieldSetFlags()[16] ? this.complex_column : (java.util.Map>) defaultValue(fields()[16]); + record.strings_column = fieldSetFlags()[0] ? this.strings_column : (java.util.List) defaultValue(fields()[0]); + record.string_to_int_column = fieldSetFlags()[1] ? this.string_to_int_column : (java.util.Map) defaultValue(fields()[1]); + record.complex_column = fieldSetFlags()[2] ? this.complex_column : (java.util.Map>) defaultValue(fields()[2]); return record; } catch (Exception e) { throw new org.apache.avro.AvroRuntimeException(e); diff --git a/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/parquet/ParquetAvroCompatibilitySuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/parquet/ParquetAvroCompatibilitySuite.scala index 82d40e2b61a10..45db619567a22 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/parquet/ParquetAvroCompatibilitySuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/parquet/ParquetAvroCompatibilitySuite.scala @@ -20,7 +20,8 @@ package org.apache.spark.sql.execution.datasources.parquet import java.nio.ByteBuffer import java.util.{List => JList, Map => JMap} -import scala.collection.JavaConversions._ +import scala.collection.JavaConverters.seqAsJavaListConverter +import scala.collection.JavaConverters.mapAsJavaMapConverter import org.apache.avro.Schema import org.apache.avro.generic.IndexedRecord @@ -32,48 +33,196 @@ import org.apache.spark.sql.execution.datasources.parquet.test.avro._ import org.apache.spark.sql.test.SharedSQLContext class ParquetAvroCompatibilitySuite extends ParquetCompatibilityTest with SharedSQLContext { - import ParquetCompatibilityTest._ - private def withWriter[T <: IndexedRecord] (path: String, schema: Schema) (f: AvroParquetWriter[T] => Unit): Unit = { + logInfo( + s"""Writing Avro records with the following Avro schema into Parquet file: + | + |${schema.toString(true)} + """.stripMargin) + val writer = new AvroParquetWriter[T](new Path(path), schema) try f(writer) finally writer.close() } - test("Read Parquet file generated by parquet-avro") { + test("required primitives") { withTempPath { dir => val path = dir.getCanonicalPath - 
withWriter[ParquetAvroCompat](path, ParquetAvroCompat.getClassSchema) { writer => - (0 until 10).foreach(i => writer.write(makeParquetAvroCompat(i))) + withWriter[AvroPrimitives](path, AvroPrimitives.getClassSchema) { writer => + (0 until 10).foreach { i => + writer.write( + AvroPrimitives.newBuilder() + .setBoolColumn(i % 2 == 0) + .setIntColumn(i) + .setLongColumn(i.toLong * 10) + .setFloatColumn(i.toFloat + 0.1f) + .setDoubleColumn(i.toDouble + 0.2d) + .setBinaryColumn(ByteBuffer.wrap(s"val_$i".getBytes("UTF-8"))) + .setStringColumn(s"val_$i") + .build()) + } } - logInfo( - s"""Schema of the Parquet file written by parquet-avro: - |${readParquetSchema(path)} - """.stripMargin) + logParquetSchema(path) checkAnswer(sqlContext.read.parquet(path), (0 until 10).map { i => - def nullable[T <: AnyRef]: ( => T) => T = makeNullable[T](i) - Row( i % 2 == 0, i, i.toLong * 10, i.toFloat + 0.1f, i.toDouble + 0.2d, - s"val_$i".getBytes, - s"val_$i", + s"val_$i".getBytes("UTF-8"), + s"val_$i") + }) + } + } + + test("optional primitives") { + withTempPath { dir => + val path = dir.getCanonicalPath + + withWriter[AvroOptionalPrimitives](path, AvroOptionalPrimitives.getClassSchema) { writer => + (0 until 10).foreach { i => + val record = if (i % 3 == 0) { + AvroOptionalPrimitives.newBuilder() + .setMaybeBoolColumn(null) + .setMaybeIntColumn(null) + .setMaybeLongColumn(null) + .setMaybeFloatColumn(null) + .setMaybeDoubleColumn(null) + .setMaybeBinaryColumn(null) + .setMaybeStringColumn(null) + .build() + } else { + AvroOptionalPrimitives.newBuilder() + .setMaybeBoolColumn(i % 2 == 0) + .setMaybeIntColumn(i) + .setMaybeLongColumn(i.toLong * 10) + .setMaybeFloatColumn(i.toFloat + 0.1f) + .setMaybeDoubleColumn(i.toDouble + 0.2d) + .setMaybeBinaryColumn(ByteBuffer.wrap(s"val_$i".getBytes("UTF-8"))) + .setMaybeStringColumn(s"val_$i") + .build() + } + + writer.write(record) + } + } + + logParquetSchema(path) + + checkAnswer(sqlContext.read.parquet(path), (0 until 10).map { i => + if (i % 3 == 0) { + Row.apply(Seq.fill(7)(null): _*) + } else { + Row( + i % 2 == 0, + i, + i.toLong * 10, + i.toFloat + 0.1f, + i.toDouble + 0.2d, + s"val_$i".getBytes("UTF-8"), + s"val_$i") + } + }) + } + } + + test("non-nullable arrays") { + withTempPath { dir => + val path = dir.getCanonicalPath + + withWriter[AvroNonNullableArrays](path, AvroNonNullableArrays.getClassSchema) { writer => + (0 until 10).foreach { i => + val record = { + val builder = + AvroNonNullableArrays.newBuilder() + .setStringsColumn(Seq.tabulate(3)(i => s"val_$i").asJava) + + if (i % 3 == 0) { + builder.setMaybeIntsColumn(null).build() + } else { + builder.setMaybeIntsColumn(Seq.tabulate(3)(Int.box).asJava).build() + } + } + + writer.write(record) + } + } + + logParquetSchema(path) + + checkAnswer(sqlContext.read.parquet(path), (0 until 10).map { i => + Row( + Seq.tabulate(3)(i => s"val_$i"), + if (i % 3 == 0) null else Seq.tabulate(3)(identity)) + }) + } + } + + ignore("nullable arrays (parquet-avro 1.7.0 does not properly support this)") { + // TODO Complete this test case after upgrading to parquet-mr 1.8+ + } + + test("SPARK-10136 array of primitive array") { + withTempPath { dir => + val path = dir.getCanonicalPath + + withWriter[AvroArrayOfArray](path, AvroArrayOfArray.getClassSchema) { writer => + (0 until 10).foreach { i => + writer.write(AvroArrayOfArray.newBuilder() + .setIntArraysColumn( + Seq.tabulate(3, 3)((i, j) => i * 3 + j: Integer).map(_.asJava).asJava) + .build()) + } + } - nullable(i % 2 == 0: java.lang.Boolean), - nullable(i: Integer), - 
nullable(i.toLong: java.lang.Long), - nullable(i.toFloat + 0.1f: java.lang.Float), - nullable(i.toDouble + 0.2d: java.lang.Double), - nullable(s"val_$i".getBytes), - nullable(s"val_$i"), + logParquetSchema(path) + checkAnswer(sqlContext.read.parquet(path), (0 until 10).map { i => + Row(Seq.tabulate(3, 3)((i, j) => i * 3 + j)) + }) + } + } + + test("map of primitive array") { + withTempPath { dir => + val path = dir.getCanonicalPath + + withWriter[AvroMapOfArray](path, AvroMapOfArray.getClassSchema) { writer => + (0 until 10).foreach { i => + writer.write(AvroMapOfArray.newBuilder() + .setStringToIntsColumn( + Seq.tabulate(3) { i => + i.toString -> Seq.tabulate(3)(j => i + j: Integer).asJava + }.toMap.asJava) + .build()) + } + } + + logParquetSchema(path) + + checkAnswer(sqlContext.read.parquet(path), (0 until 10).map { i => + Row(Seq.tabulate(3)(i => i.toString -> Seq.tabulate(3)(j => i + j)).toMap) + }) + } + } + + test("various complex types") { + withTempPath { dir => + val path = dir.getCanonicalPath + + withWriter[ParquetAvroCompat](path, ParquetAvroCompat.getClassSchema) { writer => + (0 until 10).foreach(i => writer.write(makeParquetAvroCompat(i))) + } + + logParquetSchema(path) + + checkAnswer(sqlContext.read.parquet(path), (0 until 10).map { i => + Row( Seq.tabulate(3)(n => s"arr_${i + n}"), Seq.tabulate(3)(n => n.toString -> (i + n: Integer)).toMap, Seq.tabulate(3) { n => @@ -86,47 +235,27 @@ class ParquetAvroCompatibilitySuite extends ParquetCompatibilityTest with Shared } def makeParquetAvroCompat(i: Int): ParquetAvroCompat = { - def nullable[T <: AnyRef] = makeNullable[T](i) _ - def makeComplexColumn(i: Int): JMap[String, JList[Nested]] = { - mapAsJavaMap(Seq.tabulate(3) { n => - (i + n).toString -> seqAsJavaList(Seq.tabulate(3) { m => + Seq.tabulate(3) { n => + (i + n).toString -> Seq.tabulate(3) { m => Nested .newBuilder() - .setNestedIntsColumn(seqAsJavaList(Seq.tabulate(3)(j => i + j + m))) + .setNestedIntsColumn(Seq.tabulate(3)(j => i + j + m: Integer).asJava) .setNestedStringColumn(s"val_${i + m}") .build() - }) - }.toMap) + }.asJava + }.toMap.asJava } ParquetAvroCompat .newBuilder() - .setBoolColumn(i % 2 == 0) - .setIntColumn(i) - .setLongColumn(i.toLong * 10) - .setFloatColumn(i.toFloat + 0.1f) - .setDoubleColumn(i.toDouble + 0.2d) - .setBinaryColumn(ByteBuffer.wrap(s"val_$i".getBytes)) - .setStringColumn(s"val_$i") - - .setMaybeBoolColumn(nullable(i % 2 == 0: java.lang.Boolean)) - .setMaybeIntColumn(nullable(i: Integer)) - .setMaybeLongColumn(nullable(i.toLong: java.lang.Long)) - .setMaybeFloatColumn(nullable(i.toFloat + 0.1f: java.lang.Float)) - .setMaybeDoubleColumn(nullable(i.toDouble + 0.2d: java.lang.Double)) - .setMaybeBinaryColumn(nullable(ByteBuffer.wrap(s"val_$i".getBytes))) - .setMaybeStringColumn(nullable(s"val_$i")) - - .setStringsColumn(Seq.tabulate(3)(n => s"arr_${i + n}")) - .setStringToIntColumn( - mapAsJavaMap(Seq.tabulate(3)(n => n.toString -> (i + n: Integer)).toMap)) + .setStringsColumn(Seq.tabulate(3)(n => s"arr_${i + n}").asJava) + .setStringToIntColumn(Seq.tabulate(3)(n => n.toString -> (i + n: Integer)).toMap.asJava) .setComplexColumn(makeComplexColumn(i)) - .build() } - test("SPARK-9407 Don't push down predicates involving Parquet ENUM columns") { + test("SPARK-9407 Push down predicates involving Parquet ENUM columns") { import testImplicits._ withTempPath { dir => diff --git a/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/parquet/ParquetCompatibilityTest.scala 
b/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/parquet/ParquetCompatibilityTest.scala index b3406729fcc5e..d85c564e3e8d1 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/parquet/ParquetCompatibilityTest.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/parquet/ParquetCompatibilityTest.scala @@ -43,6 +43,13 @@ private[sql] abstract class ParquetCompatibilityTest extends QueryTest with Parq val footers = ParquetFileReader.readAllFootersInParallel(configuration, parquetFiles, true) footers.head.getParquetMetadata.getFileMetaData.getSchema } + + protected def logParquetSchema(path: String): Unit = { + logInfo( + s"""Schema of the Parquet file written by parquet-avro: + |${readParquetSchema(path)} + """.stripMargin) + } } object ParquetCompatibilityTest { From 12de348332108f8c0c5bdad1d4cfac89b952b0f8 Mon Sep 17 00:00:00 2001 From: Josh Rosen Date: Thu, 20 Aug 2015 11:31:03 -0700 Subject: [PATCH 023/802] [SPARK-10126] [PROJECT INFRA] Fix typo in release-build.sh which broke snapshot publishing for Scala 2.11 The current `release-build.sh` has a typo which breaks snapshot publication for Scala 2.11. We should change the Scala version to 2.11 and clean before building a 2.11 snapshot. Author: Josh Rosen Closes #8325 from JoshRosen/fix-2.11-snapshots. --- dev/create-release/release-build.sh | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/dev/create-release/release-build.sh b/dev/create-release/release-build.sh index 399c73e7bf6bc..d0b3a54dde1dc 100755 --- a/dev/create-release/release-build.sh +++ b/dev/create-release/release-build.sh @@ -225,9 +225,9 @@ if [[ "$1" == "publish-snapshot" ]]; then $MVN -DzincPort=$ZINC_PORT --settings $tmp_settings -DskipTests $PUBLISH_PROFILES \ -Phive-thriftserver deploy - ./dev/change-scala-version.sh 2.10 + ./dev/change-scala-version.sh 2.11 $MVN -DzincPort=$ZINC_PORT -Dscala-2.11 --settings $tmp_settings \ - -DskipTests $PUBLISH_PROFILES deploy + -DskipTests $PUBLISH_PROFILES clean deploy # Clean-up Zinc nailgun process /usr/sbin/lsof -P |grep $ZINC_PORT | grep LISTEN | awk '{ print $2; }' | xargs kill From 907df2fce00d2cbc9fae371344f05f800e0d2726 Mon Sep 17 00:00:00 2001 From: Wenchen Fan Date: Thu, 20 Aug 2015 13:51:54 -0700 Subject: [PATCH 024/802] [SQL] [MINOR] remove unnecessary class This class is identical to `org.apache.spark.sql.execution.datasources.jdbc. DefaultSource` and is not needed. Author: Wenchen Fan Closes #8334 from cloud-fan/minor. --- .../execution/datasources/DefaultSource.scala | 64 ------------------- 1 file changed, 64 deletions(-) delete mode 100644 sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/DefaultSource.scala diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/DefaultSource.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/DefaultSource.scala deleted file mode 100644 index 6e4cc4de7f651..0000000000000 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/DefaultSource.scala +++ /dev/null @@ -1,64 +0,0 @@ -/* -* Licensed to the Apache Software Foundation (ASF) under one or more -* contributor license agreements. See the NOTICE file distributed with -* this work for additional information regarding copyright ownership. -* The ASF licenses this file to You under the Apache License, Version 2.0 -* (the "License"); you may not use this file except in compliance with -* the License. 
You may obtain a copy of the License at -* -* http://www.apache.org/licenses/LICENSE-2.0 -* -* Unless required by applicable law or agreed to in writing, software -* distributed under the License is distributed on an "AS IS" BASIS, -* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -* See the License for the specific language governing permissions and -* limitations under the License. -*/ - -package org.apache.spark.sql.execution.datasources - -import java.util.Properties - -import org.apache.spark.sql.SQLContext -import org.apache.spark.sql.execution.datasources.jdbc.{JDBCRelation, JDBCPartitioningInfo, DriverRegistry} -import org.apache.spark.sql.sources.{BaseRelation, DataSourceRegister, RelationProvider} - - -class DefaultSource extends RelationProvider with DataSourceRegister { - - override def shortName(): String = "jdbc" - - /** Returns a new base relation with the given parameters. */ - override def createRelation( - sqlContext: SQLContext, - parameters: Map[String, String]): BaseRelation = { - val url = parameters.getOrElse("url", sys.error("Option 'url' not specified")) - val driver = parameters.getOrElse("driver", null) - val table = parameters.getOrElse("dbtable", sys.error("Option 'dbtable' not specified")) - val partitionColumn = parameters.getOrElse("partitionColumn", null) - val lowerBound = parameters.getOrElse("lowerBound", null) - val upperBound = parameters.getOrElse("upperBound", null) - val numPartitions = parameters.getOrElse("numPartitions", null) - - if (driver != null) DriverRegistry.register(driver) - - if (partitionColumn != null - && (lowerBound == null || upperBound == null || numPartitions == null)) { - sys.error("Partitioning incompletely specified") - } - - val partitionInfo = if (partitionColumn == null) { - null - } else { - JDBCPartitioningInfo( - partitionColumn, - lowerBound.toLong, - upperBound.toLong, - numPartitions.toInt) - } - val parts = JDBCRelation.columnPartition(partitionInfo) - val properties = new Properties() // Additional properties that we will pass to getConnection - parameters.foreach(kv => properties.setProperty(kv._1, kv._2)) - JDBCRelation(url, table, parts, properties)(sqlContext) - } -} From 2a3d98aae285aba39786e9809f96de412a130f39 Mon Sep 17 00:00:00 2001 From: Xiangrui Meng Date: Thu, 20 Aug 2015 14:47:04 -0700 Subject: [PATCH 025/802] [SPARK-10138] [ML] move setters to MultilayerPerceptronClassifier and add Java test suite Otherwise, setters do not return self type. jkbradley avulanov Author: Xiangrui Meng Closes #8342 from mengxr/SPARK-10138. 
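For context, a minimal Java sketch of the fluent usage this change enables; it mirrors the Java test suite added below, and the layer sizes and other parameter values are arbitrary illustrations rather than recommended settings:

    import org.apache.spark.ml.classification.MultilayerPerceptronClassifier;

    // Fluent configuration from Java: each setter now returns the classifier itself.
    MultilayerPerceptronClassifier mlpc = new MultilayerPerceptronClassifier()
        .setLayers(new int[] {2, 5, 2})   // sizes of input, hidden, and output layers
        .setBlockSize(1)                  // block size for stacking input data
        .setSeed(11L)                     // seed for weight initialization
        .setMaxIter(100);                 // maximum number of iterations

    // With the setters previously declared in the private[ml] params trait they did not,
    // as the commit message notes, return the concrete self type, so this style of
    // chaining was not available to Java callers.
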
--- .../MultilayerPerceptronClassifier.scala | 54 +++++++------- ...vaMultilayerPerceptronClassifierSuite.java | 74 +++++++++++++++++++ 2 files changed, 101 insertions(+), 27 deletions(-) create mode 100644 mllib/src/test/java/org/apache/spark/ml/classification/JavaMultilayerPerceptronClassifierSuite.java diff --git a/mllib/src/main/scala/org/apache/spark/ml/classification/MultilayerPerceptronClassifier.scala b/mllib/src/main/scala/org/apache/spark/ml/classification/MultilayerPerceptronClassifier.scala index ccca4ecc004c3..1e5b0bc4453e4 100644 --- a/mllib/src/main/scala/org/apache/spark/ml/classification/MultilayerPerceptronClassifier.scala +++ b/mllib/src/main/scala/org/apache/spark/ml/classification/MultilayerPerceptronClassifier.scala @@ -42,9 +42,6 @@ private[ml] trait MultilayerPerceptronParams extends PredictorParams ParamValidators.arrayLengthGt(1) ) - /** @group setParam */ - def setLayers(value: Array[Int]): this.type = set(layers, value) - /** @group getParam */ final def getLayers: Array[Int] = $(layers) @@ -61,33 +58,9 @@ private[ml] trait MultilayerPerceptronParams extends PredictorParams "it is adjusted to the size of this data. Recommended size is between 10 and 1000", ParamValidators.gt(0)) - /** @group setParam */ - def setBlockSize(value: Int): this.type = set(blockSize, value) - /** @group getParam */ final def getBlockSize: Int = $(blockSize) - /** - * Set the maximum number of iterations. - * Default is 100. - * @group setParam - */ - def setMaxIter(value: Int): this.type = set(maxIter, value) - - /** - * Set the convergence tolerance of iterations. - * Smaller value will lead to higher accuracy with the cost of more iterations. - * Default is 1E-4. - * @group setParam - */ - def setTol(value: Double): this.type = set(tol, value) - - /** - * Set the seed for weights initialization. - * @group setParam - */ - def setSeed(value: Long): this.type = set(seed, value) - setDefault(maxIter -> 100, tol -> 1e-4, layers -> Array(1, 1), blockSize -> 128) } @@ -136,6 +109,33 @@ class MultilayerPerceptronClassifier(override val uid: String) def this() = this(Identifiable.randomUID("mlpc")) + /** @group setParam */ + def setLayers(value: Array[Int]): this.type = set(layers, value) + + /** @group setParam */ + def setBlockSize(value: Int): this.type = set(blockSize, value) + + /** + * Set the maximum number of iterations. + * Default is 100. + * @group setParam + */ + def setMaxIter(value: Int): this.type = set(maxIter, value) + + /** + * Set the convergence tolerance of iterations. + * Smaller value will lead to higher accuracy with the cost of more iterations. + * Default is 1E-4. + * @group setParam + */ + def setTol(value: Double): this.type = set(tol, value) + + /** + * Set the seed for weights initialization. + * @group setParam + */ + def setSeed(value: Long): this.type = set(seed, value) + override def copy(extra: ParamMap): MultilayerPerceptronClassifier = defaultCopy(extra) /** diff --git a/mllib/src/test/java/org/apache/spark/ml/classification/JavaMultilayerPerceptronClassifierSuite.java b/mllib/src/test/java/org/apache/spark/ml/classification/JavaMultilayerPerceptronClassifierSuite.java new file mode 100644 index 0000000000000..ec6b4bf3c0f8c --- /dev/null +++ b/mllib/src/test/java/org/apache/spark/ml/classification/JavaMultilayerPerceptronClassifierSuite.java @@ -0,0 +1,74 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. 
See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.spark.ml.classification; + +import java.io.Serializable; +import java.util.Arrays; + +import org.junit.After; +import org.junit.Assert; +import org.junit.Before; +import org.junit.Test; + +import org.apache.spark.api.java.JavaSparkContext; +import org.apache.spark.mllib.linalg.Vectors; +import org.apache.spark.mllib.regression.LabeledPoint; +import org.apache.spark.sql.DataFrame; +import org.apache.spark.sql.Row; +import org.apache.spark.sql.SQLContext; + +public class JavaMultilayerPerceptronClassifierSuite implements Serializable { + + private transient JavaSparkContext jsc; + private transient SQLContext sqlContext; + + @Before + public void setUp() { + jsc = new JavaSparkContext("local", "JavaLogisticRegressionSuite"); + sqlContext = new SQLContext(jsc); + } + + @After + public void tearDown() { + jsc.stop(); + jsc = null; + sqlContext = null; + } + + @Test + public void testMLPC() { + DataFrame dataFrame = sqlContext.createDataFrame( + jsc.parallelize(Arrays.asList( + new LabeledPoint(0.0, Vectors.dense(0.0, 0.0)), + new LabeledPoint(1.0, Vectors.dense(0.0, 1.0)), + new LabeledPoint(1.0, Vectors.dense(1.0, 0.0)), + new LabeledPoint(0.0, Vectors.dense(1.0, 1.0)))), + LabeledPoint.class); + MultilayerPerceptronClassifier mlpc = new MultilayerPerceptronClassifier() + .setLayers(new int[] {2, 5, 2}) + .setBlockSize(1) + .setSeed(11L) + .setMaxIter(100); + MultilayerPerceptronClassificationModel model = mlpc.fit(dataFrame); + DataFrame result = model.transform(dataFrame); + Row[] predictionAndLabels = result.select("prediction", "label").collect(); + for (Row r: predictionAndLabels) { + Assert.assertEquals((int) r.getDouble(0), (int) r.getDouble(1)); + } + } +} From 7cfc0750e14f2c1b3847e4720cc02150253525a9 Mon Sep 17 00:00:00 2001 From: MechCoder Date: Thu, 20 Aug 2015 14:56:08 -0700 Subject: [PATCH 026/802] [SPARK-10108] Add since tags to mllib.feature Author: MechCoder Closes #8309 from MechCoder/tags_feature. 
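For context, here is a brief usage sketch of one of the classes annotated in this patch. The ChiSqSelector constructor, fit, and transform signatures match the diff below; the local SparkContext and the toy data are illustrative assumptions only.

    import org.apache.spark.SparkContext
    import org.apache.spark.mllib.feature.ChiSqSelector
    import org.apache.spark.mllib.linalg.Vectors
    import org.apache.spark.mllib.regression.LabeledPoint

    object ChiSqSelectorExample {
      def main(args: Array[String]): Unit = {
        val sc = new SparkContext("local", "ChiSqSelectorExample")
        // Discretized (categorical) features, as the fit() scaladoc recommends.
        val data = sc.parallelize(Seq(
          LabeledPoint(0.0, Vectors.dense(0.0, 1.0, 2.0)),
          LabeledPoint(1.0, Vectors.dense(1.0, 1.0, 0.0)),
          LabeledPoint(1.0, Vectors.dense(2.0, 0.0, 0.0))))
        // Keep the single feature with the highest chi-squared statistic.
        val model = new ChiSqSelector(1).fit(data)
        val reduced = data.map(lp => LabeledPoint(lp.label, model.transform(lp.features)))
        reduced.collect().foreach(println)
        sc.stop()
      }
    }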
--- .../spark/mllib/feature/ChiSqSelector.scala | 12 +++++++++--- .../mllib/feature/ElementwiseProduct.scala | 4 +++- .../spark/mllib/feature/HashingTF.scala | 11 ++++++++++- .../org/apache/spark/mllib/feature/IDF.scala | 8 +++++++- .../spark/mllib/feature/Normalizer.scala | 5 ++++- .../org/apache/spark/mllib/feature/PCA.scala | 9 ++++++++- .../spark/mllib/feature/StandardScaler.scala | 13 ++++++++++++- .../mllib/feature/VectorTransformer.scala | 6 +++++- .../apache/spark/mllib/feature/Word2Vec.scala | 19 ++++++++++++++++++- 9 files changed, 76 insertions(+), 11 deletions(-) diff --git a/mllib/src/main/scala/org/apache/spark/mllib/feature/ChiSqSelector.scala b/mllib/src/main/scala/org/apache/spark/mllib/feature/ChiSqSelector.scala index 5f8c1dea237b4..fdd974d7a391e 100644 --- a/mllib/src/main/scala/org/apache/spark/mllib/feature/ChiSqSelector.scala +++ b/mllib/src/main/scala/org/apache/spark/mllib/feature/ChiSqSelector.scala @@ -19,7 +19,7 @@ package org.apache.spark.mllib.feature import scala.collection.mutable.ArrayBuilder -import org.apache.spark.annotation.Experimental +import org.apache.spark.annotation.{Experimental, Since} import org.apache.spark.mllib.linalg.{DenseVector, SparseVector, Vector, Vectors} import org.apache.spark.mllib.regression.LabeledPoint import org.apache.spark.mllib.stat.Statistics @@ -31,8 +31,10 @@ import org.apache.spark.rdd.RDD * * @param selectedFeatures list of indices to select (filter). Must be ordered asc */ +@Since("1.3.0") @Experimental -class ChiSqSelectorModel (val selectedFeatures: Array[Int]) extends VectorTransformer { +class ChiSqSelectorModel ( + @Since("1.3.0") val selectedFeatures: Array[Int]) extends VectorTransformer { require(isSorted(selectedFeatures), "Array has to be sorted asc") @@ -52,6 +54,7 @@ class ChiSqSelectorModel (val selectedFeatures: Array[Int]) extends VectorTransf * @param vector vector to be transformed. * @return transformed vector. */ + @Since("1.3.0") override def transform(vector: Vector): Vector = { compress(vector, selectedFeatures) } @@ -107,8 +110,10 @@ class ChiSqSelectorModel (val selectedFeatures: Array[Int]) extends VectorTransf * @param numTopFeatures number of features that selector will select * (ordered by statistic value descending) */ +@Since("1.3.0") @Experimental -class ChiSqSelector (val numTopFeatures: Int) extends Serializable { +class ChiSqSelector ( + @Since("1.3.0") val numTopFeatures: Int) extends Serializable { /** * Returns a ChiSquared feature selector. @@ -117,6 +122,7 @@ class ChiSqSelector (val numTopFeatures: Int) extends Serializable { * Real-valued features will be treated as categorical for each distinct value. * Apply feature discretizer before using this function. */ + @Since("1.3.0") def fit(data: RDD[LabeledPoint]): ChiSqSelectorModel = { val indices = Statistics.chiSqTest(data) .zipWithIndex.sortBy { case (res, _) => -res.statistic } diff --git a/mllib/src/main/scala/org/apache/spark/mllib/feature/ElementwiseProduct.scala b/mllib/src/main/scala/org/apache/spark/mllib/feature/ElementwiseProduct.scala index d67fe6c3ee4f8..33e2d17bb472e 100644 --- a/mllib/src/main/scala/org/apache/spark/mllib/feature/ElementwiseProduct.scala +++ b/mllib/src/main/scala/org/apache/spark/mllib/feature/ElementwiseProduct.scala @@ -17,7 +17,7 @@ package org.apache.spark.mllib.feature -import org.apache.spark.annotation.Experimental +import org.apache.spark.annotation.{Experimental, Since} import org.apache.spark.mllib.linalg._ /** @@ -27,6 +27,7 @@ import org.apache.spark.mllib.linalg._ * multiplier. 
* @param scalingVec The values used to scale the reference vector's individual components. */ +@Since("1.4.0") @Experimental class ElementwiseProduct(val scalingVec: Vector) extends VectorTransformer { @@ -36,6 +37,7 @@ class ElementwiseProduct(val scalingVec: Vector) extends VectorTransformer { * @param vector vector to be transformed. * @return transformed vector. */ + @Since("1.4.0") override def transform(vector: Vector): Vector = { require(vector.size == scalingVec.size, s"vector sizes do not match: Expected ${scalingVec.size} but found ${vector.size}") diff --git a/mllib/src/main/scala/org/apache/spark/mllib/feature/HashingTF.scala b/mllib/src/main/scala/org/apache/spark/mllib/feature/HashingTF.scala index c53475818395f..e47d524b61623 100644 --- a/mllib/src/main/scala/org/apache/spark/mllib/feature/HashingTF.scala +++ b/mllib/src/main/scala/org/apache/spark/mllib/feature/HashingTF.scala @@ -22,7 +22,7 @@ import java.lang.{Iterable => JavaIterable} import scala.collection.JavaConverters._ import scala.collection.mutable -import org.apache.spark.annotation.Experimental +import org.apache.spark.annotation.{Experimental, Since} import org.apache.spark.api.java.JavaRDD import org.apache.spark.mllib.linalg.{Vector, Vectors} import org.apache.spark.rdd.RDD @@ -34,19 +34,25 @@ import org.apache.spark.util.Utils * * @param numFeatures number of features (default: 2^20^) */ +@Since("1.1.0") @Experimental class HashingTF(val numFeatures: Int) extends Serializable { + /** + */ + @Since("1.1.0") def this() = this(1 << 20) /** * Returns the index of the input term. */ + @Since("1.1.0") def indexOf(term: Any): Int = Utils.nonNegativeMod(term.##, numFeatures) /** * Transforms the input document into a sparse term frequency vector. */ + @Since("1.1.0") def transform(document: Iterable[_]): Vector = { val termFrequencies = mutable.HashMap.empty[Int, Double] document.foreach { term => @@ -59,6 +65,7 @@ class HashingTF(val numFeatures: Int) extends Serializable { /** * Transforms the input document into a sparse term frequency vector (Java version). */ + @Since("1.1.0") def transform(document: JavaIterable[_]): Vector = { transform(document.asScala) } @@ -66,6 +73,7 @@ class HashingTF(val numFeatures: Int) extends Serializable { /** * Transforms the input document to term frequency vectors. */ + @Since("1.1.0") def transform[D <: Iterable[_]](dataset: RDD[D]): RDD[Vector] = { dataset.map(this.transform) } @@ -73,6 +81,7 @@ class HashingTF(val numFeatures: Int) extends Serializable { /** * Transforms the input document to term frequency vectors (Java version). 
*/ + @Since("1.1.0") def transform[D <: JavaIterable[_]](dataset: JavaRDD[D]): JavaRDD[Vector] = { dataset.rdd.map(this.transform).toJavaRDD() } diff --git a/mllib/src/main/scala/org/apache/spark/mllib/feature/IDF.scala b/mllib/src/main/scala/org/apache/spark/mllib/feature/IDF.scala index 3fab7ea79befc..d5353ddd972e0 100644 --- a/mllib/src/main/scala/org/apache/spark/mllib/feature/IDF.scala +++ b/mllib/src/main/scala/org/apache/spark/mllib/feature/IDF.scala @@ -19,7 +19,7 @@ package org.apache.spark.mllib.feature import breeze.linalg.{DenseVector => BDV} -import org.apache.spark.annotation.Experimental +import org.apache.spark.annotation.{Experimental, Since} import org.apache.spark.api.java.JavaRDD import org.apache.spark.mllib.linalg.{DenseVector, SparseVector, Vector, Vectors} import org.apache.spark.rdd.RDD @@ -37,6 +37,7 @@ import org.apache.spark.rdd.RDD * @param minDocFreq minimum of documents in which a term * should appear for filtering */ +@Since("1.1.0") @Experimental class IDF(val minDocFreq: Int) { @@ -48,6 +49,7 @@ class IDF(val minDocFreq: Int) { * Computes the inverse document frequency. * @param dataset an RDD of term frequency vectors */ + @Since("1.1.0") def fit(dataset: RDD[Vector]): IDFModel = { val idf = dataset.treeAggregate(new IDF.DocumentFrequencyAggregator( minDocFreq = minDocFreq))( @@ -61,6 +63,7 @@ class IDF(val minDocFreq: Int) { * Computes the inverse document frequency. * @param dataset a JavaRDD of term frequency vectors */ + @Since("1.1.0") def fit(dataset: JavaRDD[Vector]): IDFModel = { fit(dataset.rdd) } @@ -171,6 +174,7 @@ class IDFModel private[spark] (val idf: Vector) extends Serializable { * @param dataset an RDD of term frequency vectors * @return an RDD of TF-IDF vectors */ + @Since("1.1.0") def transform(dataset: RDD[Vector]): RDD[Vector] = { val bcIdf = dataset.context.broadcast(idf) dataset.mapPartitions(iter => iter.map(v => IDFModel.transform(bcIdf.value, v))) @@ -182,6 +186,7 @@ class IDFModel private[spark] (val idf: Vector) extends Serializable { * @param v a term frequency vector * @return a TF-IDF vector */ + @Since("1.3.0") def transform(v: Vector): Vector = IDFModel.transform(idf, v) /** @@ -189,6 +194,7 @@ class IDFModel private[spark] (val idf: Vector) extends Serializable { * @param dataset a JavaRDD of term frequency vectors * @return a JavaRDD of TF-IDF vectors */ + @Since("1.1.0") def transform(dataset: JavaRDD[Vector]): JavaRDD[Vector] = { transform(dataset.rdd).toJavaRDD() } diff --git a/mllib/src/main/scala/org/apache/spark/mllib/feature/Normalizer.scala b/mllib/src/main/scala/org/apache/spark/mllib/feature/Normalizer.scala index 32848e039eb81..0e070257d9fb2 100644 --- a/mllib/src/main/scala/org/apache/spark/mllib/feature/Normalizer.scala +++ b/mllib/src/main/scala/org/apache/spark/mllib/feature/Normalizer.scala @@ -17,7 +17,7 @@ package org.apache.spark.mllib.feature -import org.apache.spark.annotation.Experimental +import org.apache.spark.annotation.{Experimental, Since} import org.apache.spark.mllib.linalg.{DenseVector, SparseVector, Vector, Vectors} /** @@ -31,9 +31,11 @@ import org.apache.spark.mllib.linalg.{DenseVector, SparseVector, Vector, Vectors * * @param p Normalization in L^p^ space, p = 2 by default. */ +@Since("1.1.0") @Experimental class Normalizer(p: Double) extends VectorTransformer { + @Since("1.1.0") def this() = this(2) require(p >= 1.0) @@ -44,6 +46,7 @@ class Normalizer(p: Double) extends VectorTransformer { * @param vector vector to be normalized. * @return normalized vector. 
If the norm of the input is zero, it will return the input vector. */ + @Since("1.1.0") override def transform(vector: Vector): Vector = { val norm = Vectors.norm(vector, p) diff --git a/mllib/src/main/scala/org/apache/spark/mllib/feature/PCA.scala b/mllib/src/main/scala/org/apache/spark/mllib/feature/PCA.scala index 2a66263d8b7d6..a48b7bba665d7 100644 --- a/mllib/src/main/scala/org/apache/spark/mllib/feature/PCA.scala +++ b/mllib/src/main/scala/org/apache/spark/mllib/feature/PCA.scala @@ -17,6 +17,7 @@ package org.apache.spark.mllib.feature +import org.apache.spark.annotation.{Experimental, Since} import org.apache.spark.api.java.JavaRDD import org.apache.spark.mllib.linalg._ import org.apache.spark.mllib.linalg.distributed.RowMatrix @@ -27,6 +28,7 @@ import org.apache.spark.rdd.RDD * * @param k number of principal components */ +@Since("1.4.0") class PCA(val k: Int) { require(k >= 1, s"PCA requires a number of principal components k >= 1 but was given $k") @@ -35,6 +37,7 @@ class PCA(val k: Int) { * * @param sources source vectors */ + @Since("1.4.0") def fit(sources: RDD[Vector]): PCAModel = { require(k <= sources.first().size, s"source vector size is ${sources.first().size} must be greater than k=$k") @@ -58,7 +61,10 @@ class PCA(val k: Int) { new PCAModel(k, pc) } - /** Java-friendly version of [[fit()]] */ + /** + * Java-friendly version of [[fit()]] + */ + @Since("1.4.0") def fit(sources: JavaRDD[Vector]): PCAModel = fit(sources.rdd) } @@ -76,6 +82,7 @@ class PCAModel private[spark] (val k: Int, val pc: DenseMatrix) extends VectorTr * Vector must be the same length as the source vectors given to [[PCA.fit()]]. * @return transformed vector. Vector will be of length k. */ + @Since("1.4.0") override def transform(vector: Vector): Vector = { vector match { case dv: DenseVector => diff --git a/mllib/src/main/scala/org/apache/spark/mllib/feature/StandardScaler.scala b/mllib/src/main/scala/org/apache/spark/mllib/feature/StandardScaler.scala index c73b8f258060d..b95d5a899001e 100644 --- a/mllib/src/main/scala/org/apache/spark/mllib/feature/StandardScaler.scala +++ b/mllib/src/main/scala/org/apache/spark/mllib/feature/StandardScaler.scala @@ -18,7 +18,7 @@ package org.apache.spark.mllib.feature import org.apache.spark.Logging -import org.apache.spark.annotation.{DeveloperApi, Experimental} +import org.apache.spark.annotation.{DeveloperApi, Experimental, Since} import org.apache.spark.mllib.linalg.{DenseVector, SparseVector, Vector, Vectors} import org.apache.spark.mllib.stat.MultivariateOnlineSummarizer import org.apache.spark.rdd.RDD @@ -32,9 +32,11 @@ import org.apache.spark.rdd.RDD * dense output, so this does not work on sparse input and will raise an exception. * @param withStd True by default. Scales the data to unit standard deviation. */ +@Since("1.1.0") @Experimental class StandardScaler(withMean: Boolean, withStd: Boolean) extends Logging { + @Since("1.1.0") def this() = this(false, true) if (!(withMean || withStd)) { @@ -47,6 +49,7 @@ class StandardScaler(withMean: Boolean, withStd: Boolean) extends Logging { * @param data The data used to compute the mean and variance to build the transformation model. 
* @return a StandardScalarModel */ + @Since("1.1.0") def fit(data: RDD[Vector]): StandardScalerModel = { // TODO: skip computation if both withMean and withStd are false val summary = data.treeAggregate(new MultivariateOnlineSummarizer)( @@ -69,6 +72,7 @@ class StandardScaler(withMean: Boolean, withStd: Boolean) extends Logging { * @param withStd whether to scale the data to have unit standard deviation * @param withMean whether to center the data before scaling */ +@Since("1.1.0") @Experimental class StandardScalerModel ( val std: Vector, @@ -76,6 +80,9 @@ class StandardScalerModel ( var withStd: Boolean, var withMean: Boolean) extends VectorTransformer { + /** + */ + @Since("1.3.0") def this(std: Vector, mean: Vector) { this(std, mean, withStd = std != null, withMean = mean != null) require(this.withStd || this.withMean, @@ -86,8 +93,10 @@ class StandardScalerModel ( } } + @Since("1.3.0") def this(std: Vector) = this(std, null) + @Since("1.3.0") @DeveloperApi def setWithMean(withMean: Boolean): this.type = { require(!(withMean && this.mean == null), "cannot set withMean to true while mean is null") @@ -95,6 +104,7 @@ class StandardScalerModel ( this } + @Since("1.3.0") @DeveloperApi def setWithStd(withStd: Boolean): this.type = { require(!(withStd && this.std == null), @@ -115,6 +125,7 @@ class StandardScalerModel ( * @return Standardized vector. If the std of a column is zero, it will return default `0.0` * for the column with zero std. */ + @Since("1.1.0") override def transform(vector: Vector): Vector = { require(mean.size == vector.size) if (withMean) { diff --git a/mllib/src/main/scala/org/apache/spark/mllib/feature/VectorTransformer.scala b/mllib/src/main/scala/org/apache/spark/mllib/feature/VectorTransformer.scala index 7358c1c84f79c..5778fd1d09254 100644 --- a/mllib/src/main/scala/org/apache/spark/mllib/feature/VectorTransformer.scala +++ b/mllib/src/main/scala/org/apache/spark/mllib/feature/VectorTransformer.scala @@ -17,7 +17,7 @@ package org.apache.spark.mllib.feature -import org.apache.spark.annotation.DeveloperApi +import org.apache.spark.annotation.{DeveloperApi, Since} import org.apache.spark.api.java.JavaRDD import org.apache.spark.mllib.linalg.Vector import org.apache.spark.rdd.RDD @@ -26,6 +26,7 @@ import org.apache.spark.rdd.RDD * :: DeveloperApi :: * Trait for transformation of a vector */ +@Since("1.1.0") @DeveloperApi trait VectorTransformer extends Serializable { @@ -35,6 +36,7 @@ trait VectorTransformer extends Serializable { * @param vector vector to be transformed. * @return transformed vector. */ + @Since("1.1.0") def transform(vector: Vector): Vector /** @@ -43,6 +45,7 @@ trait VectorTransformer extends Serializable { * @param data RDD[Vector] to be transformed. * @return transformed RDD[Vector]. */ + @Since("1.1.0") def transform(data: RDD[Vector]): RDD[Vector] = { // Later in #1498 , all RDD objects are sent via broadcasting instead of akka. // So it should be no longer necessary to explicitly broadcast `this` object. @@ -55,6 +58,7 @@ trait VectorTransformer extends Serializable { * @param data JavaRDD[Vector] to be transformed. * @return transformed JavaRDD[Vector]. 
*/ + @Since("1.1.0") def transform(data: JavaRDD[Vector]): JavaRDD[Vector] = { transform(data.rdd) } diff --git a/mllib/src/main/scala/org/apache/spark/mllib/feature/Word2Vec.scala b/mllib/src/main/scala/org/apache/spark/mllib/feature/Word2Vec.scala index cbbd2b0c8d060..e6f45ae4b01d5 100644 --- a/mllib/src/main/scala/org/apache/spark/mllib/feature/Word2Vec.scala +++ b/mllib/src/main/scala/org/apache/spark/mllib/feature/Word2Vec.scala @@ -32,7 +32,7 @@ import org.json4s.jackson.JsonMethods._ import org.apache.spark.Logging import org.apache.spark.SparkContext import org.apache.spark.SparkContext._ -import org.apache.spark.annotation.Experimental +import org.apache.spark.annotation.{Experimental, Since} import org.apache.spark.api.java.JavaRDD import org.apache.spark.mllib.linalg.{Vector, Vectors, DenseMatrix, BLAS, DenseVector} import org.apache.spark.mllib.util.{Loader, Saveable} @@ -70,6 +70,7 @@ private case class VocabWord( * and * Distributed Representations of Words and Phrases and their Compositionality. */ +@Since("1.1.0") @Experimental class Word2Vec extends Serializable with Logging { @@ -83,6 +84,7 @@ class Word2Vec extends Serializable with Logging { /** * Sets vector size (default: 100). */ + @Since("1.1.0") def setVectorSize(vectorSize: Int): this.type = { this.vectorSize = vectorSize this @@ -91,6 +93,7 @@ class Word2Vec extends Serializable with Logging { /** * Sets initial learning rate (default: 0.025). */ + @Since("1.1.0") def setLearningRate(learningRate: Double): this.type = { this.learningRate = learningRate this @@ -99,6 +102,7 @@ class Word2Vec extends Serializable with Logging { /** * Sets number of partitions (default: 1). Use a small number for accuracy. */ + @Since("1.1.0") def setNumPartitions(numPartitions: Int): this.type = { require(numPartitions > 0, s"numPartitions must be greater than 0 but got $numPartitions") this.numPartitions = numPartitions @@ -109,6 +113,7 @@ class Word2Vec extends Serializable with Logging { * Sets number of iterations (default: 1), which should be smaller than or equal to number of * partitions. */ + @Since("1.1.0") def setNumIterations(numIterations: Int): this.type = { this.numIterations = numIterations this @@ -117,6 +122,7 @@ class Word2Vec extends Serializable with Logging { /** * Sets random seed (default: a random long integer). */ + @Since("1.1.0") def setSeed(seed: Long): this.type = { this.seed = seed this @@ -126,6 +132,7 @@ class Word2Vec extends Serializable with Logging { * Sets minCount, the minimum number of times a token must appear to be included in the word2vec * model's vocabulary (default: 5). 
*/ + @Since("1.3.0") def setMinCount(minCount: Int): this.type = { this.minCount = minCount this @@ -263,6 +270,7 @@ class Word2Vec extends Serializable with Logging { * @param dataset an RDD of words * @return a Word2VecModel */ + @Since("1.1.0") def fit[S <: Iterable[String]](dataset: RDD[S]): Word2VecModel = { val words = dataset.flatMap(x => x) @@ -412,6 +420,7 @@ class Word2Vec extends Serializable with Logging { * @param dataset a JavaRDD of words * @return a Word2VecModel */ + @Since("1.1.0") def fit[S <: JavaIterable[String]](dataset: JavaRDD[S]): Word2VecModel = { fit(dataset.rdd.map(_.asScala)) } @@ -454,6 +463,7 @@ class Word2VecModel private[mllib] ( wordVecNorms } + @Since("1.5.0") def this(model: Map[String, Array[Float]]) = { this(Word2VecModel.buildWordIndex(model), Word2VecModel.buildWordVectors(model)) } @@ -469,6 +479,7 @@ class Word2VecModel private[mllib] ( override protected def formatVersion = "1.0" + @Since("1.4.0") def save(sc: SparkContext, path: String): Unit = { Word2VecModel.SaveLoadV1_0.save(sc, path, getVectors) } @@ -478,6 +489,7 @@ class Word2VecModel private[mllib] ( * @param word a word * @return vector representation of word */ + @Since("1.1.0") def transform(word: String): Vector = { wordIndex.get(word) match { case Some(ind) => @@ -494,6 +506,7 @@ class Word2VecModel private[mllib] ( * @param num number of synonyms to find * @return array of (word, cosineSimilarity) */ + @Since("1.1.0") def findSynonyms(word: String, num: Int): Array[(String, Double)] = { val vector = transform(word) findSynonyms(vector, num) @@ -505,6 +518,7 @@ class Word2VecModel private[mllib] ( * @param num number of synonyms to find * @return array of (word, cosineSimilarity) */ + @Since("1.1.0") def findSynonyms(vector: Vector, num: Int): Array[(String, Double)] = { require(num > 0, "Number of similar words should > 0") // TODO: optimize top-k @@ -534,6 +548,7 @@ class Word2VecModel private[mllib] ( /** * Returns a map of words to their vector representations. */ + @Since("1.2.0") def getVectors: Map[String, Array[Float]] = { wordIndex.map { case (word, ind) => (word, wordVectors.slice(vectorSize * ind, vectorSize * ind + vectorSize)) @@ -541,6 +556,7 @@ class Word2VecModel private[mllib] ( } } +@Since("1.4.0") @Experimental object Word2VecModel extends Loader[Word2VecModel] { @@ -600,6 +616,7 @@ object Word2VecModel extends Loader[Word2VecModel] { } } + @Since("1.4.0") override def load(sc: SparkContext, path: String): Word2VecModel = { val (loadedClassName, loadedVersion, metadata) = Loader.loadMetadata(sc, path) From eaafe139f881d6105996373c9b11f2ccd91b5b3e Mon Sep 17 00:00:00 2001 From: "Joseph K. Bradley" Date: Thu, 20 Aug 2015 15:01:31 -0700 Subject: [PATCH 027/802] [SPARK-9245] [MLLIB] LDA topic assignments For each (document, term) pair, return top topic. Note that instances of (doc, term) pairs within a document (a.k.a. "tokens") are exchangeable, so we should provide an estimate per document-term, rather than per token. CC: rotationsymmetry mengxr Author: Joseph K. Bradley Closes #8329 from jkbradley/lda-topic-assignments. 
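A rough sketch of how a caller might consume the new per-document assignments (assuming an already trained DistributedLDAModel): the two arrays returned for each document are zippable, pairing each term index with its most likely topic.

    import org.apache.spark.mllib.clustering.DistributedLDAModel

    // `model` is assumed to have been trained elsewhere; only topicAssignments comes from this patch.
    def printTopAssignments(model: DistributedLDAModel): Unit = {
      model.topicAssignments.take(5).foreach { case (docId, termIndices, topicIndices) =>
        // topicIndices(i) is the top topic for termIndices(i); terms absent from the doc are omitted.
        termIndices.zip(topicIndices).foreach { case (term, topic) =>
          println(s"doc $docId: term $term -> topic $topic")
        }
      }
    }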
--- .../spark/mllib/clustering/LDAModel.scala | 51 +++++++++++++++++-- .../spark/mllib/clustering/LDAOptimizer.scala | 2 +- .../spark/mllib/clustering/JavaLDASuite.java | 7 +++ .../spark/mllib/clustering/LDASuite.scala | 21 +++++++- 4 files changed, 74 insertions(+), 7 deletions(-) diff --git a/mllib/src/main/scala/org/apache/spark/mllib/clustering/LDAModel.scala b/mllib/src/main/scala/org/apache/spark/mllib/clustering/LDAModel.scala index b70e380c0393e..6bc68a4c18b99 100644 --- a/mllib/src/main/scala/org/apache/spark/mllib/clustering/LDAModel.scala +++ b/mllib/src/main/scala/org/apache/spark/mllib/clustering/LDAModel.scala @@ -17,7 +17,7 @@ package org.apache.spark.mllib.clustering -import breeze.linalg.{DenseMatrix => BDM, DenseVector => BDV, argtopk, normalize, sum} +import breeze.linalg.{DenseMatrix => BDM, DenseVector => BDV, argmax, argtopk, normalize, sum} import breeze.numerics.{exp, lgamma} import org.apache.hadoop.fs.Path import org.json4s.DefaultFormats @@ -438,7 +438,7 @@ object LocalLDAModel extends Loader[LocalLDAModel] { Loader.checkSchema[Data](dataFrame.schema) val topics = dataFrame.collect() val vocabSize = topics(0).getAs[Vector](0).size - val k = topics.size + val k = topics.length val brzTopics = BDM.zeros[Double](vocabSize, k) topics.foreach { case Row(vec: Vector, ind: Int) => @@ -610,6 +610,50 @@ class DistributedLDAModel private[clustering] ( } } + /** + * Return the top topic for each (doc, term) pair. I.e., for each document, what is the most + * likely topic generating each term? + * + * @return RDD of (doc ID, assignment of top topic index for each term), + * where the assignment is specified via a pair of zippable arrays + * (term indices, topic indices). Note that terms will be omitted if not present in + * the document. + */ + lazy val topicAssignments: RDD[(Long, Array[Int], Array[Int])] = { + // For reference, compare the below code with the core part of EMLDAOptimizer.next(). + val eta = topicConcentration + val W = vocabSize + val alpha = docConcentration(0) + val N_k = globalTopicTotals + val sendMsg: EdgeContext[TopicCounts, TokenCount, (Array[Int], Array[Int])] => Unit = + (edgeContext) => { + // E-STEP: Compute gamma_{wjk} (smoothed topic distributions). + val scaledTopicDistribution: TopicCounts = + computePTopic(edgeContext.srcAttr, edgeContext.dstAttr, N_k, W, eta, alpha) + // For this (doc j, term w), send top topic k to doc vertex. + val topTopic: Int = argmax(scaledTopicDistribution) + val term: Int = index2term(edgeContext.dstId) + edgeContext.sendToSrc((Array(term), Array(topTopic))) + } + val mergeMsg: ((Array[Int], Array[Int]), (Array[Int], Array[Int])) => (Array[Int], Array[Int]) = + (terms_topics0, terms_topics1) => { + (terms_topics0._1 ++ terms_topics1._1, terms_topics0._2 ++ terms_topics1._2) + } + // M-STEP: Aggregation computes new N_{kj}, N_{wk} counts. + val perDocAssignments = + graph.aggregateMessages[(Array[Int], Array[Int])](sendMsg, mergeMsg).filter(isDocumentVertex) + perDocAssignments.map { case (docID: Long, (terms: Array[Int], topics: Array[Int])) => + // TODO: Avoid zip, which is inefficient. 
+ val (sortedTerms, sortedTopics) = terms.zip(topics).sortBy(_._1).unzip + (docID, sortedTerms.toArray, sortedTopics.toArray) + } + } + + /** Java-friendly version of [[topicAssignments]] */ + lazy val javaTopicAssignments: JavaRDD[(java.lang.Long, Array[Int], Array[Int])] = { + topicAssignments.asInstanceOf[RDD[(java.lang.Long, Array[Int], Array[Int])]].toJavaRDD() + } + // TODO // override def logLikelihood(documents: RDD[(Long, Vector)]): Double = ??? @@ -849,10 +893,9 @@ object DistributedLDAModel extends Loader[DistributedLDAModel] { val classNameV1_0 = SaveLoadV1_0.thisClassName val model = (loadedClassName, loadedVersion) match { - case (className, "1.0") if className == classNameV1_0 => { + case (className, "1.0") if className == classNameV1_0 => DistributedLDAModel.SaveLoadV1_0.load(sc, path, vocabSize, docConcentration, topicConcentration, iterationTimes.toArray, gammaShape) - } case _ => throw new Exception( s"DistributedLDAModel.load did not recognize model with (className, format version):" + s"($loadedClassName, $loadedVersion). Supported: ($classNameV1_0, 1.0)") diff --git a/mllib/src/main/scala/org/apache/spark/mllib/clustering/LDAOptimizer.scala b/mllib/src/main/scala/org/apache/spark/mllib/clustering/LDAOptimizer.scala index 360241c8081ac..cb517f9689ade 100644 --- a/mllib/src/main/scala/org/apache/spark/mllib/clustering/LDAOptimizer.scala +++ b/mllib/src/main/scala/org/apache/spark/mllib/clustering/LDAOptimizer.scala @@ -167,7 +167,7 @@ final class EMLDAOptimizer extends LDAOptimizer { edgeContext.sendToDst((false, scaledTopicDistribution)) edgeContext.sendToSrc((false, scaledTopicDistribution)) } - // This is a hack to detect whether we could modify the values in-place. + // The Boolean is a hack to detect whether we could modify the values in-place. // TODO: Add zero/seqOp/combOp option to aggregateMessages. 
(SPARK-5438) val mergeMsg: ((Boolean, TopicCounts), (Boolean, TopicCounts)) => (Boolean, TopicCounts) = (m0, m1) => { diff --git a/mllib/src/test/java/org/apache/spark/mllib/clustering/JavaLDASuite.java b/mllib/src/test/java/org/apache/spark/mllib/clustering/JavaLDASuite.java index 6e91cde2eabb5..3fea359a3b46c 100644 --- a/mllib/src/test/java/org/apache/spark/mllib/clustering/JavaLDASuite.java +++ b/mllib/src/test/java/org/apache/spark/mllib/clustering/JavaLDASuite.java @@ -134,6 +134,13 @@ public Boolean call(Tuple2 tuple2) { double[] topicWeights = topTopics._3(); assertEquals(3, topicIndices.length); assertEquals(3, topicWeights.length); + + // Check: topTopicAssignments + Tuple3 topicAssignment = model.javaTopicAssignments().first(); + Long docId2 = topicAssignment._1(); + int[] termIndices2 = topicAssignment._2(); + int[] topicIndices2 = topicAssignment._3(); + assertEquals(termIndices2.length, topicIndices2.length); } @Test diff --git a/mllib/src/test/scala/org/apache/spark/mllib/clustering/LDASuite.scala b/mllib/src/test/scala/org/apache/spark/mllib/clustering/LDASuite.scala index 99e28499fd316..8a714f9b79e02 100644 --- a/mllib/src/test/scala/org/apache/spark/mllib/clustering/LDASuite.scala +++ b/mllib/src/test/scala/org/apache/spark/mllib/clustering/LDASuite.scala @@ -135,17 +135,34 @@ class LDASuite extends SparkFunSuite with MLlibTestSparkContext { } // Top 3 documents per topic - model.topDocumentsPerTopic(3).zip(topDocsByTopicDistributions(3)).foreach {case (t1, t2) => + model.topDocumentsPerTopic(3).zip(topDocsByTopicDistributions(3)).foreach { case (t1, t2) => assert(t1._1 === t2._1) assert(t1._2 === t2._2) } // All documents per topic val q = tinyCorpus.length - model.topDocumentsPerTopic(q).zip(topDocsByTopicDistributions(q)).foreach {case (t1, t2) => + model.topDocumentsPerTopic(q).zip(topDocsByTopicDistributions(q)).foreach { case (t1, t2) => assert(t1._1 === t2._1) assert(t1._2 === t2._2) } + + // Check: topTopicAssignments + // Make sure it assigns a topic to each term appearing in each doc. + val topTopicAssignments: Map[Long, (Array[Int], Array[Int])] = + model.topicAssignments.collect().map(x => x._1 -> (x._2, x._3)).toMap + assert(topTopicAssignments.keys.max < tinyCorpus.length) + tinyCorpus.foreach { case (docID: Long, doc: Vector) => + if (topTopicAssignments.contains(docID)) { + val (inds, vals) = topTopicAssignments(docID) + assert(inds.length === doc.numNonzeros) + // For "term" in actual doc, + // check that it has a topic assigned. + doc.foreachActive((term, wcnt) => assert(wcnt === 0 || inds.contains(term))) + } else { + assert(doc.numNonzeros === 0) + } + } } test("vertex indexing") { From afe9f03fd964d1e8604d02feee8d6970efbe6009 Mon Sep 17 00:00:00 2001 From: Tarek Auel Date: Thu, 20 Aug 2015 15:10:13 -0700 Subject: [PATCH 028/802] [SPARK-9400] [SQL] codegen for StringLocate This is based on #7779 , thanks to tarekauel . Fix the conflict and nullability. Closes #7779 and #8274 . Author: Tarek Auel Author: Davies Liu Closes #8330 from davies/stringLocate. 
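As a small usage sketch (the DataFrame setup is an assumption for illustration; only the expression's semantics come from the diff): `locate` returns a 1-based position, 0 when the substring is absent, and NULL when a string argument is NULL, which is the nullability this patch pins down.

    import org.apache.spark.sql.SQLContext
    import org.apache.spark.sql.functions.{col, locate}

    object LocateDemo {
      // sqlContext is assumed to be supplied by the caller (e.g. a spark-shell session).
      def run(sqlContext: SQLContext): Unit = {
        import sqlContext.implicits._
        val df = Seq(Tuple1("foobarbar"), Tuple1(null: String)).toDF("s")
        df.select(
          locate("bar", col("s")),  // 1-based match position: 4 for "foobarbar"; NULL for the null row
          locate("xyz", col("s"))   // 0 when the substring is not found; NULL for the null row
        ).show()
      }
    }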
--- .../expressions/stringExpressions.scala | 28 ++++++++++++++++++- 1 file changed, 27 insertions(+), 1 deletion(-) diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/stringExpressions.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/stringExpressions.scala index 3c23f2ecfb57c..b60d318534a41 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/stringExpressions.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/stringExpressions.scala @@ -409,13 +409,14 @@ case class SubstringIndex(strExpr: Expression, delimExpr: Expression, countExpr: * in given string after position pos. */ case class StringLocate(substr: Expression, str: Expression, start: Expression) - extends TernaryExpression with ImplicitCastInputTypes with CodegenFallback { + extends TernaryExpression with ImplicitCastInputTypes { def this(substr: Expression, str: Expression) = { this(substr, str, Literal(0)) } override def children: Seq[Expression] = substr :: str :: start :: Nil + override def nullable: Boolean = substr.nullable || str.nullable override def dataType: DataType = IntegerType override def inputTypes: Seq[DataType] = Seq(StringType, StringType, IntegerType) @@ -441,6 +442,31 @@ case class StringLocate(substr: Expression, str: Expression, start: Expression) } } + override protected def genCode(ctx: CodeGenContext, ev: GeneratedExpressionCode): String = { + val substrGen = substr.gen(ctx) + val strGen = str.gen(ctx) + val startGen = start.gen(ctx) + s""" + int ${ev.primitive} = 0; + boolean ${ev.isNull} = false; + ${startGen.code} + if (!${startGen.isNull}) { + ${substrGen.code} + if (!${substrGen.isNull}) { + ${strGen.code} + if (!${strGen.isNull}) { + ${ev.primitive} = ${strGen.primitive}.indexOf(${substrGen.primitive}, + ${startGen.primitive}) + 1; + } else { + ${ev.isNull} = true; + } + } else { + ${ev.isNull} = true; + } + } + """ + } + override def prettyName: String = "locate" } From cdd9a2bb10e20556003843a0f7aaa33acd55f6d2 Mon Sep 17 00:00:00 2001 From: Xiangrui Meng Date: Thu, 20 Aug 2015 20:01:13 -0700 Subject: [PATCH 029/802] [SPARK-10140] [DOC] add target fields to @Since so constructors parameters and public fields can be annotated. rxin MechCoder Author: Xiangrui Meng Closes #8344 from mengxr/SPARK-10140.2. --- core/src/main/scala/org/apache/spark/annotation/Since.scala | 2 ++ 1 file changed, 2 insertions(+) diff --git a/core/src/main/scala/org/apache/spark/annotation/Since.scala b/core/src/main/scala/org/apache/spark/annotation/Since.scala index fa59393c22476..af483e361e339 100644 --- a/core/src/main/scala/org/apache/spark/annotation/Since.scala +++ b/core/src/main/scala/org/apache/spark/annotation/Since.scala @@ -18,6 +18,7 @@ package org.apache.spark.annotation import scala.annotation.StaticAnnotation +import scala.annotation.meta._ /** * A Scala annotation that specifies the Spark version when a definition was added. @@ -25,4 +26,5 @@ import scala.annotation.StaticAnnotation * hence works for overridden methods that inherit API documentation directly from parents. * The limitation is that it does not show up in the generated Java API documentation. 
*/ +@param @field @getter @setter @beanGetter @beanSetter private[spark] class Since(version: String) extends StaticAnnotation From dcfe0c5cde953b31c5bfeb6e41d1fc9b333241eb Mon Sep 17 00:00:00 2001 From: Alexander Ulanov Date: Thu, 20 Aug 2015 20:02:27 -0700 Subject: [PATCH 030/802] [SPARK-9846] [DOCS] User guide for Multilayer Perceptron Classifier Added user guide for multilayer perceptron classifier: - Simplified description of the multilayer perceptron classifier - Example code for Scala and Java Author: Alexander Ulanov Closes #8262 from avulanov/SPARK-9846-mlpc-docs. --- docs/ml-ann.md | 123 +++++++++++++++++++++++++++++++++++++++++++++++ docs/ml-guide.md | 1 + 2 files changed, 124 insertions(+) create mode 100644 docs/ml-ann.md diff --git a/docs/ml-ann.md b/docs/ml-ann.md new file mode 100644 index 0000000000000..d5ddd92af1e96 --- /dev/null +++ b/docs/ml-ann.md @@ -0,0 +1,123 @@ +--- +layout: global +title: Multilayer perceptron classifier - ML +displayTitle: ML - Multilayer perceptron classifier +--- + + +`\[ +\newcommand{\R}{\mathbb{R}} +\newcommand{\E}{\mathbb{E}} +\newcommand{\x}{\mathbf{x}} +\newcommand{\y}{\mathbf{y}} +\newcommand{\wv}{\mathbf{w}} +\newcommand{\av}{\mathbf{\alpha}} +\newcommand{\bv}{\mathbf{b}} +\newcommand{\N}{\mathbb{N}} +\newcommand{\id}{\mathbf{I}} +\newcommand{\ind}{\mathbf{1}} +\newcommand{\0}{\mathbf{0}} +\newcommand{\unit}{\mathbf{e}} +\newcommand{\one}{\mathbf{1}} +\newcommand{\zero}{\mathbf{0}} +\]` + + +Multilayer perceptron classifier (MLPC) is a classifier based on the [feedforward artificial neural network](https://en.wikipedia.org/wiki/Feedforward_neural_network). +MLPC consists of multiple layers of nodes. +Each layer is fully connected to the next layer in the network. Nodes in the input layer represent the input data. All other nodes maps inputs to the outputs +by performing linear combination of the inputs with the node's weights `$\wv$` and bias `$\bv$` and applying an activation function. +It can be written in matrix form for MLPC with `$K+1$` layers as follows: +`\[ +\mathrm{y}(\x) = \mathrm{f_K}(...\mathrm{f_2}(\wv_2^T\mathrm{f_1}(\wv_1^T \x+b_1)+b_2)...+b_K) +\]` +Nodes in intermediate layers use sigmoid (logistic) function: +`\[ +\mathrm{f}(z_i) = \frac{1}{1 + e^{-z_i}} +\]` +Nodes in the output layer use softmax function: +`\[ +\mathrm{f}(z_i) = \frac{e^{z_i}}{\sum_{k=1}^N e^{z_k}} +\]` +The number of nodes `$N$` in the output layer corresponds to the number of classes. + +MLPC employes backpropagation for learning the model. We use logistic loss function for optimization and L-BFGS as optimization routine. + +**Examples** + +
+ +
+ +{% highlight scala %} +import org.apache.spark.ml.classification.MultilayerPerceptronClassifier +import org.apache.spark.ml.evaluation.MulticlassClassificationEvaluator +import org.apache.spark.mllib.util.MLUtils +import org.apache.spark.sql.Row + +// Load training data +val data = MLUtils.loadLibSVMFile(sc, "data/mllib/sample_multiclass_classification_data.txt").toDF() +// Split the data into train and test +val splits = data.randomSplit(Array(0.6, 0.4), seed = 1234L) +val train = splits(0) +val test = splits(1) +// specify layers for the neural network: +// input layer of size 4 (features), two intermediate of size 5 and 4 and output of size 3 (classes) +val layers = Array[Int](4, 5, 4, 3) +// create the trainer and set its parameters +val trainer = new MultilayerPerceptronClassifier() + .setLayers(layers) + .setBlockSize(128) + .setSeed(1234L) + .setMaxIter(100) +// train the model +val model = trainer.fit(train) +// compute precision on the test set +val result = model.transform(test) +val predictionAndLabels = result.select("prediction", "label") +val evaluator = new MulticlassClassificationEvaluator() + .setMetricName("precision") +println("Precision:" + evaluator.evaluate(predictionAndLabels)) +{% endhighlight %} + +
+ +
+ +{% highlight java %} +import org.apache.spark.api.java.JavaRDD; +import org.apache.spark.ml.classification.MultilayerPerceptronClassificationModel; +import org.apache.spark.ml.classification.MultilayerPerceptronClassifier; +import org.apache.spark.ml.evaluation.MulticlassClassificationEvaluator; +import org.apache.spark.mllib.regression.LabeledPoint; +import org.apache.spark.mllib.util.MLUtils; + +// Load training data +String path = "data/mllib/sample_multiclass_classification_data.txt"; +JavaRDD data = MLUtils.loadLibSVMFile(sc, path).toJavaRDD(); +DataFrame dataFrame = sqlContext.createDataFrame(data, LabeledPoint.class); +// Split the data into train and test +DataFrame[] splits = dataFrame.randomSplit(new double[]{0.6, 0.4}, 1234L); +DataFrame train = splits[0]; +DataFrame test = splits[1]; +// specify layers for the neural network: +// input layer of size 4 (features), two intermediate of size 5 and 4 and output of size 3 (classes) +int[] layers = new int[] {4, 5, 4, 3}; +// create the trainer and set its parameters +MultilayerPerceptronClassifier trainer = new MultilayerPerceptronClassifier() + .setLayers(layers) + .setBlockSize(128) + .setSeed(1234L) + .setMaxIter(100); +// train the model +MultilayerPerceptronClassificationModel model = trainer.fit(train); +// compute precision on the test set +DataFrame result = model.transform(test); +DataFrame predictionAndLabels = result.select("prediction", "label"); +MulticlassClassificationEvaluator evaluator = new MulticlassClassificationEvaluator() + .setMetricName("precision"); +System.out.println("Precision = " + evaluator.evaluate(predictionAndLabels)); +{% endhighlight %} +
+ +
diff --git a/docs/ml-guide.md b/docs/ml-guide.md index c64fff7c0315a..de8fead3529e4 100644 --- a/docs/ml-guide.md +++ b/docs/ml-guide.md @@ -179,6 +179,7 @@ There are now several algorithms in the Pipelines API which are not in the lower * [Decision Trees for Classification and Regression](ml-decision-tree.html) * [Ensembles](ml-ensembles.html) * [Linear methods with elastic net regularization](ml-linear-methods.html) +* [Multilayer perceptron classifier](ml-ann.html) # Code Examples From bb220f6570aa0b95598b30524224a3e82c1effbc Mon Sep 17 00:00:00 2001 From: Liang-Chi Hsieh Date: Fri, 21 Aug 2015 01:43:49 -0700 Subject: [PATCH 031/802] [SPARK-10040] [SQL] Use batch insert for JDBC writing JIRA: https://issues.apache.org/jira/browse/SPARK-10040 We should use batch insert instead of single row in JDBC. Author: Liang-Chi Hsieh Closes #8273 from viirya/jdbc-insert-batch. --- .../execution/datasources/jdbc/JdbcUtils.scala | 17 ++++++++++++++--- 1 file changed, 14 insertions(+), 3 deletions(-) diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/jdbc/JdbcUtils.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/jdbc/JdbcUtils.scala index 2d0e736ee4766..26788b2a4fd69 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/jdbc/JdbcUtils.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/jdbc/JdbcUtils.scala @@ -88,13 +88,15 @@ object JdbcUtils extends Logging { table: String, iterator: Iterator[Row], rddSchema: StructType, - nullTypes: Array[Int]): Iterator[Byte] = { + nullTypes: Array[Int], + batchSize: Int): Iterator[Byte] = { val conn = getConnection() var committed = false try { conn.setAutoCommit(false) // Everything in the same db transaction. val stmt = insertStatement(conn, table, rddSchema) try { + var rowCount = 0 while (iterator.hasNext) { val row = iterator.next() val numFields = rddSchema.fields.length @@ -122,7 +124,15 @@ object JdbcUtils extends Logging { } i = i + 1 } - stmt.executeUpdate() + stmt.addBatch() + rowCount += 1 + if (rowCount % batchSize == 0) { + stmt.executeBatch() + rowCount = 0 + } + } + if (rowCount > 0) { + stmt.executeBatch() } } finally { stmt.close() @@ -211,8 +221,9 @@ object JdbcUtils extends Logging { val rddSchema = df.schema val driver: String = DriverRegistry.getDriverClassName(url) val getConnection: () => Connection = JDBCRDD.getConnector(driver, url, properties) + val batchSize = properties.getProperty("batchsize", "1000").toInt df.foreachPartition { iterator => - savePartition(getConnection, table, iterator, rddSchema, nullTypes) + savePartition(getConnection, table, iterator, rddSchema, nullTypes, batchSize) } } From 708036c1de52d674ceff30ac465e1dcedeb8dde8 Mon Sep 17 00:00:00 2001 From: Imran Rashid Date: Fri, 21 Aug 2015 08:41:36 -0500 Subject: [PATCH 032/802] [SPARK-9439] [YARN] External shuffle service robust to NM restarts using leveldb https://issues.apache.org/jira/browse/SPARK-9439 In general, Yarn apps should be robust to NodeManager restarts. However, if you run spark with the external shuffle service on, after a NM restart all shuffles fail, b/c the shuffle service has lost some state with info on each executor. (Note the shuffle data is perfectly fine on disk across a NM restart, the problem is we've lost the small bit of state that lets us *find* those files.) The solution proposed here is that the external shuffle service can write out its state to leveldb (backed by a local file) every time an executor is added. 
When running with yarn, that file is in the NM's local dir. Whenever the service is started, it looks for that file, and if it exists, it reads the file and re-registers all executors there. Nothing is changed in non-yarn modes with this patch. The service is not given a place to save the state to, so it operates the same as before. This should make it easy to update other cluster managers as well, by just supplying the right file & the equivalent of yarn's `initializeApplication` -- I'm not familiar enough with those modes to know how to do that. Author: Imran Rashid Closes #7943 from squito/leveldb_external_shuffle_service_NM_restart and squashes the following commits: 0d285d3 [Imran Rashid] review feedback 70951d6 [Imran Rashid] Merge branch 'master' into leveldb_external_shuffle_service_NM_restart 5c71c8c [Imran Rashid] save executor to db before registering; style 2499c8c [Imran Rashid] explicit dependency on jackson-annotations 795d28f [Imran Rashid] review feedback 81f80e2 [Imran Rashid] Merge branch 'master' into leveldb_external_shuffle_service_NM_restart 594d520 [Imran Rashid] use json to serialize application executor info 1a7980b [Imran Rashid] version 8267d2a [Imran Rashid] style e9f99e8 [Imran Rashid] cleanup the handling of bad dbs a little 9378ba3 [Imran Rashid] fail gracefully on corrupt leveldb files acedb62 [Imran Rashid] switch to writing out one record per executor 79922b7 [Imran Rashid] rely on yarn to call stopApplication; assorted cleanup 12b6a35 [Imran Rashid] save registered executors when apps are removed; add tests c878fbe [Imran Rashid] better explanation of shuffle service port handling 694934c [Imran Rashid] only open leveldb connection once per service d596410 [Imran Rashid] store executor data in leveldb 59800b7 [Imran Rashid] Files.move in case renaming is unsupported 32fe5ae [Imran Rashid] Merge branch 'master' into external_shuffle_service_NM_restart d7450f0 [Imran Rashid] style f729e2b [Imran Rashid] debugging 4492835 [Imran Rashid] lol, dont use a PrintWriter b/c of scalastyle checks 0a39b98 [Imran Rashid] Merge branch 'master' into external_shuffle_service_NM_restart 55f49fc [Imran Rashid] make sure the service doesnt die if the registered executor file is corrupt; add tests 245db19 [Imran Rashid] style 62586a6 [Imran Rashid] just serialize the whole executors map bdbbf0d [Imran Rashid] comments, remove some unnecessary changes 857331a [Imran Rashid] better tests & comments bb9d1e6 [Imran Rashid] formatting bdc4b32 [Imran Rashid] rename 86e0cb9 [Imran Rashid] for tests, shuffle service finds an open port 23994ff [Imran Rashid] style 7504de8 [Imran Rashid] style a36729c [Imran Rashid] cleanup efb6195 [Imran Rashid] proper unit test, and no longer leak if apps stop during NM restart dd93dc0 [Imran Rashid] test for shuffle service w/ NM restarts d596969 [Imran Rashid] cleanup imports 0e9d69b [Imran Rashid] better names 9eae119 [Imran Rashid] cleanup lots of duplication 1136f44 [Imran Rashid] test needs to have an actual shuffle 0b588bd [Imran Rashid] more fixes ... ad122ef [Imran Rashid] more fixes 5e5a7c3 [Imran Rashid] fix build c69f46b [Imran Rashid] maybe working version, needs tests & cleanup ... 
bb3ba49 [Imran Rashid] minor cleanup 36127d3 [Imran Rashid] wip b9d2ced [Imran Rashid] incomplete setup for external shuffle service tests --- .../spark/deploy/ExternalShuffleService.scala | 2 +- .../mesos/MesosExternalShuffleService.scala | 2 +- .../apache/spark/storage/BlockManager.scala | 14 +- .../spark/ExternalShuffleServiceSuite.scala | 2 +- network/shuffle/pom.xml | 16 ++ .../shuffle/ExternalShuffleBlockHandler.java | 37 ++- .../shuffle/ExternalShuffleBlockResolver.java | 225 +++++++++++++++-- .../shuffle/protocol/ExecutorShuffleInfo.java | 8 +- .../ExternalShuffleBlockResolverSuite.java | 35 ++- .../shuffle/ExternalShuffleCleanupSuite.java | 9 +- .../ExternalShuffleIntegrationSuite.java | 2 +- .../shuffle/ExternalShuffleSecuritySuite.java | 5 +- .../network/yarn/YarnShuffleService.java | 62 ++++- pom.xml | 5 + yarn/pom.xml | 6 + .../deploy/yarn/BaseYarnClusterSuite.scala | 193 +++++++++++++++ .../spark/deploy/yarn/YarnClusterSuite.scala | 173 +------------ .../yarn/YarnShuffleIntegrationSuite.scala | 109 ++++++++ .../network/shuffle/ShuffleTestAccessor.scala | 71 ++++++ .../yarn/YarnShuffleServiceSuite.scala | 233 ++++++++++++++++++ .../spark/network/yarn/YarnTestAccessor.scala | 37 +++ 21 files changed, 1031 insertions(+), 215 deletions(-) create mode 100644 yarn/src/test/scala/org/apache/spark/deploy/yarn/BaseYarnClusterSuite.scala create mode 100644 yarn/src/test/scala/org/apache/spark/deploy/yarn/YarnShuffleIntegrationSuite.scala create mode 100644 yarn/src/test/scala/org/apache/spark/network/shuffle/ShuffleTestAccessor.scala create mode 100644 yarn/src/test/scala/org/apache/spark/network/yarn/YarnShuffleServiceSuite.scala create mode 100644 yarn/src/test/scala/org/apache/spark/network/yarn/YarnTestAccessor.scala diff --git a/core/src/main/scala/org/apache/spark/deploy/ExternalShuffleService.scala b/core/src/main/scala/org/apache/spark/deploy/ExternalShuffleService.scala index 20a9faa1784b7..22ef701d833b2 100644 --- a/core/src/main/scala/org/apache/spark/deploy/ExternalShuffleService.scala +++ b/core/src/main/scala/org/apache/spark/deploy/ExternalShuffleService.scala @@ -53,7 +53,7 @@ class ExternalShuffleService(sparkConf: SparkConf, securityManager: SecurityMana /** Create a new shuffle block handler. Factored out for subclasses to override. */ protected def newShuffleBlockHandler(conf: TransportConf): ExternalShuffleBlockHandler = { - new ExternalShuffleBlockHandler(conf) + new ExternalShuffleBlockHandler(conf, null) } /** Starts the external shuffle service if the user has configured us to. */ diff --git a/core/src/main/scala/org/apache/spark/deploy/mesos/MesosExternalShuffleService.scala b/core/src/main/scala/org/apache/spark/deploy/mesos/MesosExternalShuffleService.scala index 061857476a8a0..12337a940a414 100644 --- a/core/src/main/scala/org/apache/spark/deploy/mesos/MesosExternalShuffleService.scala +++ b/core/src/main/scala/org/apache/spark/deploy/mesos/MesosExternalShuffleService.scala @@ -34,7 +34,7 @@ import org.apache.spark.network.util.TransportConf * It detects driver termination and calls the cleanup callback to [[ExternalShuffleService]]. 
*/ private[mesos] class MesosExternalShuffleBlockHandler(transportConf: TransportConf) - extends ExternalShuffleBlockHandler(transportConf) with Logging { + extends ExternalShuffleBlockHandler(transportConf, null) with Logging { // Stores a map of driver socket addresses to app ids private val connectedApps = new mutable.HashMap[SocketAddress, String] diff --git a/core/src/main/scala/org/apache/spark/storage/BlockManager.scala b/core/src/main/scala/org/apache/spark/storage/BlockManager.scala index eedb27942e841..fefaef0ab82c8 100644 --- a/core/src/main/scala/org/apache/spark/storage/BlockManager.scala +++ b/core/src/main/scala/org/apache/spark/storage/BlockManager.scala @@ -93,8 +93,17 @@ private[spark] class BlockManager( // Port used by the external shuffle service. In Yarn mode, this may be already be // set through the Hadoop configuration as the server is launched in the Yarn NM. - private val externalShuffleServicePort = - Utils.getSparkOrYarnConfig(conf, "spark.shuffle.service.port", "7337").toInt + private val externalShuffleServicePort = { + val tmpPort = Utils.getSparkOrYarnConfig(conf, "spark.shuffle.service.port", "7337").toInt + if (tmpPort == 0) { + // for testing, we set "spark.shuffle.service.port" to 0 in the yarn config, so yarn finds + // an open port. But we still need to tell our spark apps the right port to use. So + // only if the yarn config has the port set to 0, we prefer the value in the spark config + conf.get("spark.shuffle.service.port").toInt + } else { + tmpPort + } + } // Check that we're not using external shuffle service with consolidated shuffle files. if (externalShuffleServiceEnabled @@ -191,6 +200,7 @@ private[spark] class BlockManager( executorId, blockTransferService.hostName, blockTransferService.port) shuffleServerId = if (externalShuffleServiceEnabled) { + logInfo(s"external shuffle service port = $externalShuffleServicePort") BlockManagerId(executorId, blockTransferService.hostName, externalShuffleServicePort) } else { blockManagerId diff --git a/core/src/test/scala/org/apache/spark/ExternalShuffleServiceSuite.scala b/core/src/test/scala/org/apache/spark/ExternalShuffleServiceSuite.scala index c38d70252add1..e846a72c888c6 100644 --- a/core/src/test/scala/org/apache/spark/ExternalShuffleServiceSuite.scala +++ b/core/src/test/scala/org/apache/spark/ExternalShuffleServiceSuite.scala @@ -36,7 +36,7 @@ class ExternalShuffleServiceSuite extends ShuffleSuite with BeforeAndAfterAll { override def beforeAll() { val transportConf = SparkTransportConf.fromSparkConf(conf, numUsableCores = 2) - rpcHandler = new ExternalShuffleBlockHandler(transportConf) + rpcHandler = new ExternalShuffleBlockHandler(transportConf, null) val transportContext = new TransportContext(transportConf, rpcHandler) server = transportContext.createServer() diff --git a/network/shuffle/pom.xml b/network/shuffle/pom.xml index 532463e96fbb7..3d2edf9d94515 100644 --- a/network/shuffle/pom.xml +++ b/network/shuffle/pom.xml @@ -43,6 +43,22 @@ ${project.version} + + org.fusesource.leveldbjni + leveldbjni-all + 1.8 + + + + com.fasterxml.jackson.core + jackson-databind + + + + com.fasterxml.jackson.core + jackson-annotations + + org.slf4j diff --git a/network/shuffle/src/main/java/org/apache/spark/network/shuffle/ExternalShuffleBlockHandler.java b/network/shuffle/src/main/java/org/apache/spark/network/shuffle/ExternalShuffleBlockHandler.java index db9dc4f17cee9..0df1dd621f6e9 100644 --- a/network/shuffle/src/main/java/org/apache/spark/network/shuffle/ExternalShuffleBlockHandler.java +++ 
b/network/shuffle/src/main/java/org/apache/spark/network/shuffle/ExternalShuffleBlockHandler.java @@ -17,11 +17,12 @@ package org.apache.spark.network.shuffle; +import java.io.File; +import java.io.IOException; import java.util.List; import com.google.common.annotations.VisibleForTesting; import com.google.common.collect.Lists; -import org.apache.spark.network.util.TransportConf; import org.slf4j.Logger; import org.slf4j.LoggerFactory; @@ -31,10 +32,10 @@ import org.apache.spark.network.server.OneForOneStreamManager; import org.apache.spark.network.server.RpcHandler; import org.apache.spark.network.server.StreamManager; -import org.apache.spark.network.shuffle.protocol.BlockTransferMessage; -import org.apache.spark.network.shuffle.protocol.OpenBlocks; -import org.apache.spark.network.shuffle.protocol.RegisterExecutor; -import org.apache.spark.network.shuffle.protocol.StreamHandle; +import org.apache.spark.network.shuffle.ExternalShuffleBlockResolver.AppExecId; +import org.apache.spark.network.shuffle.protocol.*; +import org.apache.spark.network.util.TransportConf; + /** * RPC Handler for a server which can serve shuffle blocks from outside of an Executor process. @@ -46,11 +47,13 @@ public class ExternalShuffleBlockHandler extends RpcHandler { private final Logger logger = LoggerFactory.getLogger(ExternalShuffleBlockHandler.class); - private final ExternalShuffleBlockResolver blockManager; + @VisibleForTesting + final ExternalShuffleBlockResolver blockManager; private final OneForOneStreamManager streamManager; - public ExternalShuffleBlockHandler(TransportConf conf) { - this(new OneForOneStreamManager(), new ExternalShuffleBlockResolver(conf)); + public ExternalShuffleBlockHandler(TransportConf conf, File registeredExecutorFile) throws IOException { + this(new OneForOneStreamManager(), + new ExternalShuffleBlockResolver(conf, registeredExecutorFile)); } /** Enables mocking out the StreamManager and BlockManager. */ @@ -105,4 +108,22 @@ public StreamManager getStreamManager() { public void applicationRemoved(String appId, boolean cleanupLocalDirs) { blockManager.applicationRemoved(appId, cleanupLocalDirs); } + + /** + * Register an (application, executor) with the given shuffle info. + * + * The "re-" is meant to highlight the intended use of this method -- when this service is + * restarted, this is used to restore the state of executors from before the restart. 
Normal + * registration will happen via a message handled in receive() + * + * @param appExecId + * @param executorInfo + */ + public void reregisterExecutor(AppExecId appExecId, ExecutorShuffleInfo executorInfo) { + blockManager.registerExecutor(appExecId.appId, appExecId.execId, executorInfo); + } + + public void close() { + blockManager.close(); + } } diff --git a/network/shuffle/src/main/java/org/apache/spark/network/shuffle/ExternalShuffleBlockResolver.java b/network/shuffle/src/main/java/org/apache/spark/network/shuffle/ExternalShuffleBlockResolver.java index 022ed88a16480..79beec4429a99 100644 --- a/network/shuffle/src/main/java/org/apache/spark/network/shuffle/ExternalShuffleBlockResolver.java +++ b/network/shuffle/src/main/java/org/apache/spark/network/shuffle/ExternalShuffleBlockResolver.java @@ -17,19 +17,24 @@ package org.apache.spark.network.shuffle; -import java.io.DataInputStream; -import java.io.File; -import java.io.FileInputStream; -import java.io.IOException; -import java.util.Iterator; -import java.util.Map; +import java.io.*; +import java.util.*; import java.util.concurrent.ConcurrentMap; import java.util.concurrent.Executor; import java.util.concurrent.Executors; +import com.fasterxml.jackson.annotation.JsonCreator; +import com.fasterxml.jackson.annotation.JsonProperty; +import com.fasterxml.jackson.databind.ObjectMapper; import com.google.common.annotations.VisibleForTesting; +import com.google.common.base.Charsets; import com.google.common.base.Objects; import com.google.common.collect.Maps; +import org.fusesource.leveldbjni.JniDBFactory; +import org.fusesource.leveldbjni.internal.NativeDB; +import org.iq80.leveldb.DB; +import org.iq80.leveldb.DBIterator; +import org.iq80.leveldb.Options; import org.slf4j.Logger; import org.slf4j.LoggerFactory; @@ -52,25 +57,87 @@ public class ExternalShuffleBlockResolver { private static final Logger logger = LoggerFactory.getLogger(ExternalShuffleBlockResolver.class); + private static final ObjectMapper mapper = new ObjectMapper(); + /** + * This a common prefix to the key for each app registration we stick in leveldb, so they + * are easy to find, since leveldb lets you search based on prefix. + */ + private static final String APP_KEY_PREFIX = "AppExecShuffleInfo"; + private static final StoreVersion CURRENT_VERSION = new StoreVersion(1, 0); + // Map containing all registered executors' metadata. - private final ConcurrentMap executors; + @VisibleForTesting + final ConcurrentMap executors; // Single-threaded Java executor used to perform expensive recursive directory deletion. private final Executor directoryCleaner; private final TransportConf conf; - public ExternalShuffleBlockResolver(TransportConf conf) { - this(conf, Executors.newSingleThreadExecutor( + @VisibleForTesting + final File registeredExecutorFile; + @VisibleForTesting + final DB db; + + public ExternalShuffleBlockResolver(TransportConf conf, File registeredExecutorFile) + throws IOException { + this(conf, registeredExecutorFile, Executors.newSingleThreadExecutor( // Add `spark` prefix because it will run in NM in Yarn mode. NettyUtils.createThreadFactory("spark-shuffle-directory-cleaner"))); } // Allows tests to have more control over when directories are cleaned up. 
@VisibleForTesting - ExternalShuffleBlockResolver(TransportConf conf, Executor directoryCleaner) { + ExternalShuffleBlockResolver( + TransportConf conf, + File registeredExecutorFile, + Executor directoryCleaner) throws IOException { this.conf = conf; - this.executors = Maps.newConcurrentMap(); + this.registeredExecutorFile = registeredExecutorFile; + if (registeredExecutorFile != null) { + Options options = new Options(); + options.createIfMissing(false); + options.logger(new LevelDBLogger()); + DB tmpDb; + try { + tmpDb = JniDBFactory.factory.open(registeredExecutorFile, options); + } catch (NativeDB.DBException e) { + if (e.isNotFound() || e.getMessage().contains(" does not exist ")) { + logger.info("Creating state database at " + registeredExecutorFile); + options.createIfMissing(true); + try { + tmpDb = JniDBFactory.factory.open(registeredExecutorFile, options); + } catch (NativeDB.DBException dbExc) { + throw new IOException("Unable to create state store", dbExc); + } + } else { + // the leveldb file seems to be corrupt somehow. Lets just blow it away and create a new + // one, so we can keep processing new apps + logger.error("error opening leveldb file {}. Creating new file, will not be able to " + + "recover state for existing applications", registeredExecutorFile, e); + if (registeredExecutorFile.isDirectory()) { + for (File f : registeredExecutorFile.listFiles()) { + f.delete(); + } + } + registeredExecutorFile.delete(); + options.createIfMissing(true); + try { + tmpDb = JniDBFactory.factory.open(registeredExecutorFile, options); + } catch (NativeDB.DBException dbExc) { + throw new IOException("Unable to create state store", dbExc); + } + + } + } + // if there is a version mismatch, we throw an exception, which means the service is unusable + checkVersion(tmpDb); + executors = reloadRegisteredExecutors(tmpDb); + db = tmpDb; + } else { + db = null; + executors = Maps.newConcurrentMap(); + } this.directoryCleaner = directoryCleaner; } @@ -81,6 +148,15 @@ public void registerExecutor( ExecutorShuffleInfo executorInfo) { AppExecId fullId = new AppExecId(appId, execId); logger.info("Registered executor {} with {}", fullId, executorInfo); + try { + if (db != null) { + byte[] key = dbAppExecKey(fullId); + byte[] value = mapper.writeValueAsString(executorInfo).getBytes(Charsets.UTF_8); + db.put(key, value); + } + } catch (Exception e) { + logger.error("Error saving registered executors", e); + } executors.put(fullId, executorInfo); } @@ -136,6 +212,13 @@ public void applicationRemoved(String appId, boolean cleanupLocalDirs) { // Only touch executors associated with the appId that was removed. if (appId.equals(fullId.appId)) { it.remove(); + if (db != null) { + try { + db.delete(dbAppExecKey(fullId)); + } catch (IOException e) { + logger.error("Error deleting {} from executor state db", appId, e); + } + } if (cleanupLocalDirs) { logger.info("Cleaning up executor {}'s {} local dirs", fullId, executor.localDirs.length); @@ -220,12 +303,23 @@ static File getFile(String[] localDirs, int subDirsPerLocalDir, String filename) return new File(new File(localDir, String.format("%02x", subDirId)), filename); } + void close() { + if (db != null) { + try { + db.close(); + } catch (IOException e) { + logger.error("Exception closing leveldb with registered executors", e); + } + } + } + /** Simply encodes an executor's full ID, which is appId + execId. 
*/ - private static class AppExecId { - final String appId; - final String execId; + public static class AppExecId { + public final String appId; + public final String execId; - private AppExecId(String appId, String execId) { + @JsonCreator + public AppExecId(@JsonProperty("appId") String appId, @JsonProperty("execId") String execId) { this.appId = appId; this.execId = execId; } @@ -252,4 +346,105 @@ public String toString() { .toString(); } } + + private static byte[] dbAppExecKey(AppExecId appExecId) throws IOException { + // we stick a common prefix on all the keys so we can find them in the DB + String appExecJson = mapper.writeValueAsString(appExecId); + String key = (APP_KEY_PREFIX + ";" + appExecJson); + return key.getBytes(Charsets.UTF_8); + } + + private static AppExecId parseDbAppExecKey(String s) throws IOException { + if (!s.startsWith(APP_KEY_PREFIX)) { + throw new IllegalArgumentException("expected a string starting with " + APP_KEY_PREFIX); + } + String json = s.substring(APP_KEY_PREFIX.length() + 1); + AppExecId parsed = mapper.readValue(json, AppExecId.class); + return parsed; + } + + @VisibleForTesting + static ConcurrentMap reloadRegisteredExecutors(DB db) + throws IOException { + ConcurrentMap registeredExecutors = Maps.newConcurrentMap(); + if (db != null) { + DBIterator itr = db.iterator(); + itr.seek(APP_KEY_PREFIX.getBytes(Charsets.UTF_8)); + while (itr.hasNext()) { + Map.Entry e = itr.next(); + String key = new String(e.getKey(), Charsets.UTF_8); + if (!key.startsWith(APP_KEY_PREFIX)) { + break; + } + AppExecId id = parseDbAppExecKey(key); + ExecutorShuffleInfo shuffleInfo = mapper.readValue(e.getValue(), ExecutorShuffleInfo.class); + registeredExecutors.put(id, shuffleInfo); + } + } + return registeredExecutors; + } + + private static class LevelDBLogger implements org.iq80.leveldb.Logger { + private static final Logger LOG = LoggerFactory.getLogger(LevelDBLogger.class); + + @Override + public void log(String message) { + LOG.info(message); + } + } + + /** + * Simple major.minor versioning scheme. Any incompatible changes should be across major + * versions. Minor version differences are allowed -- meaning we should be able to read + * dbs that are either earlier *or* later on the minor version. 
+ */ + private static void checkVersion(DB db) throws IOException { + byte[] bytes = db.get(StoreVersion.KEY); + if (bytes == null) { + storeVersion(db); + } else { + StoreVersion version = mapper.readValue(bytes, StoreVersion.class); + if (version.major != CURRENT_VERSION.major) { + throw new IOException("cannot read state DB with version " + version + ", incompatible " + + "with current version " + CURRENT_VERSION); + } + storeVersion(db); + } + } + + private static void storeVersion(DB db) throws IOException { + db.put(StoreVersion.KEY, mapper.writeValueAsBytes(CURRENT_VERSION)); + } + + + public static class StoreVersion { + + final static byte[] KEY = "StoreVersion".getBytes(Charsets.UTF_8); + + public final int major; + public final int minor; + + @JsonCreator public StoreVersion(@JsonProperty("major") int major, @JsonProperty("minor") int minor) { + this.major = major; + this.minor = minor; + } + + @Override + public boolean equals(Object o) { + if (this == o) return true; + if (o == null || getClass() != o.getClass()) return false; + + StoreVersion that = (StoreVersion) o; + + return major == that.major && minor == that.minor; + } + + @Override + public int hashCode() { + int result = major; + result = 31 * result + minor; + return result; + } + } + } diff --git a/network/shuffle/src/main/java/org/apache/spark/network/shuffle/protocol/ExecutorShuffleInfo.java b/network/shuffle/src/main/java/org/apache/spark/network/shuffle/protocol/ExecutorShuffleInfo.java index cadc8e8369c6a..102d4efb8bf3b 100644 --- a/network/shuffle/src/main/java/org/apache/spark/network/shuffle/protocol/ExecutorShuffleInfo.java +++ b/network/shuffle/src/main/java/org/apache/spark/network/shuffle/protocol/ExecutorShuffleInfo.java @@ -19,6 +19,8 @@ import java.util.Arrays; +import com.fasterxml.jackson.annotation.JsonCreator; +import com.fasterxml.jackson.annotation.JsonProperty; import com.google.common.base.Objects; import io.netty.buffer.ByteBuf; @@ -34,7 +36,11 @@ public class ExecutorShuffleInfo implements Encodable { /** Shuffle manager (SortShuffleManager or HashShuffleManager) that the executor is using. 
*/ public final String shuffleManager; - public ExecutorShuffleInfo(String[] localDirs, int subDirsPerLocalDir, String shuffleManager) { + @JsonCreator + public ExecutorShuffleInfo( + @JsonProperty("localDirs") String[] localDirs, + @JsonProperty("subDirsPerLocalDir") int subDirsPerLocalDir, + @JsonProperty("shuffleManager") String shuffleManager) { this.localDirs = localDirs; this.subDirsPerLocalDir = subDirsPerLocalDir; this.shuffleManager = shuffleManager; diff --git a/network/shuffle/src/test/java/org/apache/spark/network/shuffle/ExternalShuffleBlockResolverSuite.java b/network/shuffle/src/test/java/org/apache/spark/network/shuffle/ExternalShuffleBlockResolverSuite.java index d02f4f0fdb682..3c6cb367dea46 100644 --- a/network/shuffle/src/test/java/org/apache/spark/network/shuffle/ExternalShuffleBlockResolverSuite.java +++ b/network/shuffle/src/test/java/org/apache/spark/network/shuffle/ExternalShuffleBlockResolverSuite.java @@ -21,9 +21,12 @@ import java.io.InputStream; import java.io.InputStreamReader; +import com.fasterxml.jackson.databind.ObjectMapper; import com.google.common.io.CharStreams; +import org.apache.spark.network.shuffle.protocol.ExecutorShuffleInfo; import org.apache.spark.network.util.SystemPropertyConfigProvider; import org.apache.spark.network.util.TransportConf; +import org.apache.spark.network.shuffle.ExternalShuffleBlockResolver.AppExecId; import org.junit.AfterClass; import org.junit.BeforeClass; import org.junit.Test; @@ -59,8 +62,8 @@ public static void afterAll() { } @Test - public void testBadRequests() { - ExternalShuffleBlockResolver resolver = new ExternalShuffleBlockResolver(conf); + public void testBadRequests() throws IOException { + ExternalShuffleBlockResolver resolver = new ExternalShuffleBlockResolver(conf, null); // Unregistered executor try { resolver.getBlockData("app0", "exec1", "shuffle_1_1_0"); @@ -91,7 +94,7 @@ public void testBadRequests() { @Test public void testSortShuffleBlocks() throws IOException { - ExternalShuffleBlockResolver resolver = new ExternalShuffleBlockResolver(conf); + ExternalShuffleBlockResolver resolver = new ExternalShuffleBlockResolver(conf, null); resolver.registerExecutor("app0", "exec0", dataContext.createExecutorInfo("org.apache.spark.shuffle.sort.SortShuffleManager")); @@ -110,7 +113,7 @@ public void testSortShuffleBlocks() throws IOException { @Test public void testHashShuffleBlocks() throws IOException { - ExternalShuffleBlockResolver resolver = new ExternalShuffleBlockResolver(conf); + ExternalShuffleBlockResolver resolver = new ExternalShuffleBlockResolver(conf, null); resolver.registerExecutor("app0", "exec0", dataContext.createExecutorInfo("org.apache.spark.shuffle.hash.HashShuffleManager")); @@ -126,4 +129,28 @@ public void testHashShuffleBlocks() throws IOException { block1Stream.close(); assertEquals(hashBlock1, block1); } + + @Test + public void jsonSerializationOfExecutorRegistration() throws IOException { + ObjectMapper mapper = new ObjectMapper(); + AppExecId appId = new AppExecId("foo", "bar"); + String appIdJson = mapper.writeValueAsString(appId); + AppExecId parsedAppId = mapper.readValue(appIdJson, AppExecId.class); + assertEquals(parsedAppId, appId); + + ExecutorShuffleInfo shuffleInfo = + new ExecutorShuffleInfo(new String[]{"/bippy", "/flippy"}, 7, "hash"); + String shuffleJson = mapper.writeValueAsString(shuffleInfo); + ExecutorShuffleInfo parsedShuffleInfo = + mapper.readValue(shuffleJson, ExecutorShuffleInfo.class); + assertEquals(parsedShuffleInfo, shuffleInfo); + + // Intentionally keep 
these hard-coded strings in here, to check backwards-compatability. + // its not legacy yet, but keeping this here in case anybody changes it + String legacyAppIdJson = "{\"appId\":\"foo\", \"execId\":\"bar\"}"; + assertEquals(appId, mapper.readValue(legacyAppIdJson, AppExecId.class)); + String legacyShuffleJson = "{\"localDirs\": [\"/bippy\", \"/flippy\"], " + + "\"subDirsPerLocalDir\": 7, \"shuffleManager\": \"hash\"}"; + assertEquals(shuffleInfo, mapper.readValue(legacyShuffleJson, ExecutorShuffleInfo.class)); + } } diff --git a/network/shuffle/src/test/java/org/apache/spark/network/shuffle/ExternalShuffleCleanupSuite.java b/network/shuffle/src/test/java/org/apache/spark/network/shuffle/ExternalShuffleCleanupSuite.java index d9d9c1bf2f17a..2f4f1d0df478b 100644 --- a/network/shuffle/src/test/java/org/apache/spark/network/shuffle/ExternalShuffleCleanupSuite.java +++ b/network/shuffle/src/test/java/org/apache/spark/network/shuffle/ExternalShuffleCleanupSuite.java @@ -42,7 +42,7 @@ public void noCleanupAndCleanup() throws IOException { TestShuffleDataContext dataContext = createSomeData(); ExternalShuffleBlockResolver resolver = - new ExternalShuffleBlockResolver(conf, sameThreadExecutor); + new ExternalShuffleBlockResolver(conf, null, sameThreadExecutor); resolver.registerExecutor("app", "exec0", dataContext.createExecutorInfo("shuffleMgr")); resolver.applicationRemoved("app", false /* cleanup */); @@ -65,7 +65,8 @@ public void cleanupUsesExecutor() throws IOException { @Override public void execute(Runnable runnable) { cleanupCalled.set(true); } }; - ExternalShuffleBlockResolver manager = new ExternalShuffleBlockResolver(conf, noThreadExecutor); + ExternalShuffleBlockResolver manager = + new ExternalShuffleBlockResolver(conf, null, noThreadExecutor); manager.registerExecutor("app", "exec0", dataContext.createExecutorInfo("shuffleMgr")); manager.applicationRemoved("app", true); @@ -83,7 +84,7 @@ public void cleanupMultipleExecutors() throws IOException { TestShuffleDataContext dataContext1 = createSomeData(); ExternalShuffleBlockResolver resolver = - new ExternalShuffleBlockResolver(conf, sameThreadExecutor); + new ExternalShuffleBlockResolver(conf, null, sameThreadExecutor); resolver.registerExecutor("app", "exec0", dataContext0.createExecutorInfo("shuffleMgr")); resolver.registerExecutor("app", "exec1", dataContext1.createExecutorInfo("shuffleMgr")); @@ -99,7 +100,7 @@ public void cleanupOnlyRemovedApp() throws IOException { TestShuffleDataContext dataContext1 = createSomeData(); ExternalShuffleBlockResolver resolver = - new ExternalShuffleBlockResolver(conf, sameThreadExecutor); + new ExternalShuffleBlockResolver(conf, null, sameThreadExecutor); resolver.registerExecutor("app-0", "exec0", dataContext0.createExecutorInfo("shuffleMgr")); resolver.registerExecutor("app-1", "exec0", dataContext1.createExecutorInfo("shuffleMgr")); diff --git a/network/shuffle/src/test/java/org/apache/spark/network/shuffle/ExternalShuffleIntegrationSuite.java b/network/shuffle/src/test/java/org/apache/spark/network/shuffle/ExternalShuffleIntegrationSuite.java index 39aa49911d9cb..a3f9a38b1aeb9 100644 --- a/network/shuffle/src/test/java/org/apache/spark/network/shuffle/ExternalShuffleIntegrationSuite.java +++ b/network/shuffle/src/test/java/org/apache/spark/network/shuffle/ExternalShuffleIntegrationSuite.java @@ -92,7 +92,7 @@ public static void beforeAll() throws IOException { dataContext1.insertHashShuffleData(1, 0, exec1Blocks); conf = new TransportConf(new SystemPropertyConfigProvider()); - handler = new 
ExternalShuffleBlockHandler(conf); + handler = new ExternalShuffleBlockHandler(conf, null); TransportContext transportContext = new TransportContext(conf, handler); server = transportContext.createServer(); } diff --git a/network/shuffle/src/test/java/org/apache/spark/network/shuffle/ExternalShuffleSecuritySuite.java b/network/shuffle/src/test/java/org/apache/spark/network/shuffle/ExternalShuffleSecuritySuite.java index d4ec1956c1e29..aa99efda94948 100644 --- a/network/shuffle/src/test/java/org/apache/spark/network/shuffle/ExternalShuffleSecuritySuite.java +++ b/network/shuffle/src/test/java/org/apache/spark/network/shuffle/ExternalShuffleSecuritySuite.java @@ -43,8 +43,9 @@ public class ExternalShuffleSecuritySuite { TransportServer server; @Before - public void beforeEach() { - TransportContext context = new TransportContext(conf, new ExternalShuffleBlockHandler(conf)); + public void beforeEach() throws IOException { + TransportContext context = + new TransportContext(conf, new ExternalShuffleBlockHandler(conf, null)); TransportServerBootstrap bootstrap = new SaslServerBootstrap(conf, new TestSecretKeyHolder("my-app-id", "secret")); this.server = context.createServer(Arrays.asList(bootstrap)); diff --git a/network/yarn/src/main/java/org/apache/spark/network/yarn/YarnShuffleService.java b/network/yarn/src/main/java/org/apache/spark/network/yarn/YarnShuffleService.java index 463f99ef3352d..11ea7f3fd3cfe 100644 --- a/network/yarn/src/main/java/org/apache/spark/network/yarn/YarnShuffleService.java +++ b/network/yarn/src/main/java/org/apache/spark/network/yarn/YarnShuffleService.java @@ -17,25 +17,21 @@ package org.apache.spark.network.yarn; +import java.io.File; import java.nio.ByteBuffer; import java.util.List; +import com.google.common.annotations.VisibleForTesting; import com.google.common.collect.Lists; import org.apache.hadoop.conf.Configuration; -import org.apache.hadoop.yarn.api.records.ApplicationId; import org.apache.hadoop.yarn.api.records.ContainerId; -import org.apache.hadoop.yarn.server.api.AuxiliaryService; -import org.apache.hadoop.yarn.server.api.ApplicationInitializationContext; -import org.apache.hadoop.yarn.server.api.ApplicationTerminationContext; -import org.apache.hadoop.yarn.server.api.ContainerInitializationContext; -import org.apache.hadoop.yarn.server.api.ContainerTerminationContext; +import org.apache.hadoop.yarn.server.api.*; import org.slf4j.Logger; import org.slf4j.LoggerFactory; import org.apache.spark.network.TransportContext; import org.apache.spark.network.sasl.SaslServerBootstrap; import org.apache.spark.network.sasl.ShuffleSecretManager; -import org.apache.spark.network.server.RpcHandler; import org.apache.spark.network.server.TransportServer; import org.apache.spark.network.server.TransportServerBootstrap; import org.apache.spark.network.shuffle.ExternalShuffleBlockHandler; @@ -79,11 +75,26 @@ public class YarnShuffleService extends AuxiliaryService { private TransportServer shuffleServer = null; // Handles registering executors and opening shuffle blocks - private ExternalShuffleBlockHandler blockHandler; + @VisibleForTesting + ExternalShuffleBlockHandler blockHandler; + + // Where to store & reload executor info for recovering state after an NM restart + @VisibleForTesting + File registeredExecutorFile; + + // just for testing when you want to find an open port + @VisibleForTesting + static int boundPort = -1; + + // just for integration tests that want to look at this file -- in general not sensible as + // a static + @VisibleForTesting + static 
YarnShuffleService instance; public YarnShuffleService() { super("spark_shuffle"); logger.info("Initializing YARN shuffle service for Spark"); + instance = this; } /** @@ -100,11 +111,24 @@ private boolean isAuthenticationEnabled() { */ @Override protected void serviceInit(Configuration conf) { + + // In case this NM was killed while there were running spark applications, we need to restore + // lost state for the existing executors. We look for an existing file in the NM's local dirs. + // If we don't find one, then we choose a file to use to save the state next time. Even if + // an application was stopped while the NM was down, we expect yarn to call stopApplication() + // when it comes back + registeredExecutorFile = + findRegisteredExecutorFile(conf.getStrings("yarn.nodemanager.local-dirs")); + TransportConf transportConf = new TransportConf(new HadoopConfigProvider(conf)); // If authentication is enabled, set up the shuffle server to use a // special RPC handler that filters out unauthenticated fetch requests boolean authEnabled = conf.getBoolean(SPARK_AUTHENTICATE_KEY, DEFAULT_SPARK_AUTHENTICATE); - blockHandler = new ExternalShuffleBlockHandler(transportConf); + try { + blockHandler = new ExternalShuffleBlockHandler(transportConf, registeredExecutorFile); + } catch (Exception e) { + logger.error("Failed to initialize external shuffle service", e); + } List bootstraps = Lists.newArrayList(); if (authEnabled) { @@ -116,9 +140,13 @@ protected void serviceInit(Configuration conf) { SPARK_SHUFFLE_SERVICE_PORT_KEY, DEFAULT_SPARK_SHUFFLE_SERVICE_PORT); TransportContext transportContext = new TransportContext(transportConf, blockHandler); shuffleServer = transportContext.createServer(port, bootstraps); + // the port should normally be fixed, but for tests its useful to find an open port + port = shuffleServer.getPort(); + boundPort = port; String authEnabledString = authEnabled ? "enabled" : "not enabled"; logger.info("Started YARN shuffle service for Spark on port {}. " + - "Authentication is {}.", port, authEnabledString); + "Authentication is {}. Registered executor file is {}", port, authEnabledString, + registeredExecutorFile); } @Override @@ -161,6 +189,16 @@ public void stopContainer(ContainerTerminationContext context) { logger.info("Stopping container {}", containerId); } + private File findRegisteredExecutorFile(String[] localDirs) { + for (String dir: localDirs) { + File f = new File(dir, "registeredExecutors.ldb"); + if (f.exists()) { + return f; + } + } + return new File(localDirs[0], "registeredExecutors.ldb"); + } + /** * Close the shuffle server to clean up any associated state. 
*/ @@ -170,6 +208,9 @@ protected void serviceStop() { if (shuffleServer != null) { shuffleServer.close(); } + if (blockHandler != null) { + blockHandler.close(); + } } catch (Exception e) { logger.error("Exception when stopping service", e); } @@ -180,5 +221,4 @@ protected void serviceStop() { public ByteBuffer getMetaData() { return ByteBuffer.allocate(0); } - } diff --git a/pom.xml b/pom.xml index ccfa1ea19b21e..d5945f2546d38 100644 --- a/pom.xml +++ b/pom.xml @@ -655,6 +655,11 @@ jackson-databind ${fasterxml.jackson.version} + + com.fasterxml.jackson.core + jackson-annotations + ${fasterxml.jackson.version} + diff --git a/yarn/pom.xml b/yarn/pom.xml index 15db54e4e7909..f6737695307a2 100644 --- a/yarn/pom.xml +++ b/yarn/pom.xml @@ -38,6 +38,12 @@ spark-core_${scala.binary.version} ${project.version} + + org.apache.spark + spark-network-yarn_${scala.binary.version} + ${project.version} + test + org.apache.spark spark-core_${scala.binary.version} diff --git a/yarn/src/test/scala/org/apache/spark/deploy/yarn/BaseYarnClusterSuite.scala b/yarn/src/test/scala/org/apache/spark/deploy/yarn/BaseYarnClusterSuite.scala new file mode 100644 index 0000000000000..128e996b71fe5 --- /dev/null +++ b/yarn/src/test/scala/org/apache/spark/deploy/yarn/BaseYarnClusterSuite.scala @@ -0,0 +1,193 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.spark.deploy.yarn + +import java.io.{File, FileOutputStream, OutputStreamWriter} +import java.util.Properties +import java.util.concurrent.TimeUnit + +import scala.collection.JavaConversions._ + +import com.google.common.base.Charsets.UTF_8 +import com.google.common.io.Files +import org.apache.hadoop.yarn.conf.YarnConfiguration +import org.apache.hadoop.yarn.server.MiniYARNCluster +import org.scalatest.{BeforeAndAfterAll, Matchers} + +import org.apache.spark._ +import org.apache.spark.util.Utils + +abstract class BaseYarnClusterSuite + extends SparkFunSuite with BeforeAndAfterAll with Matchers with Logging { + + // log4j configuration for the YARN containers, so that their output is collected + // by YARN instead of trying to overwrite unit-tests.log. 
+ protected val LOG4J_CONF = """ + |log4j.rootCategory=DEBUG, console + |log4j.appender.console=org.apache.log4j.ConsoleAppender + |log4j.appender.console.target=System.err + |log4j.appender.console.layout=org.apache.log4j.PatternLayout + |log4j.appender.console.layout.ConversionPattern=%d{yy/MM/dd HH:mm:ss} %p %c{1}: %m%n + """.stripMargin + + private var yarnCluster: MiniYARNCluster = _ + protected var tempDir: File = _ + private var fakeSparkJar: File = _ + private var hadoopConfDir: File = _ + private var logConfDir: File = _ + + + def yarnConfig: YarnConfiguration + + override def beforeAll() { + super.beforeAll() + + tempDir = Utils.createTempDir() + logConfDir = new File(tempDir, "log4j") + logConfDir.mkdir() + System.setProperty("SPARK_YARN_MODE", "true") + + val logConfFile = new File(logConfDir, "log4j.properties") + Files.write(LOG4J_CONF, logConfFile, UTF_8) + + yarnCluster = new MiniYARNCluster(getClass().getName(), 1, 1, 1) + yarnCluster.init(yarnConfig) + yarnCluster.start() + + // There's a race in MiniYARNCluster in which start() may return before the RM has updated + // its address in the configuration. You can see this in the logs by noticing that when + // MiniYARNCluster prints the address, it still has port "0" assigned, although later the + // test works sometimes: + // + // INFO MiniYARNCluster: MiniYARN ResourceManager address: blah:0 + // + // That log message prints the contents of the RM_ADDRESS config variable. If you check it + // later on, it looks something like this: + // + // INFO YarnClusterSuite: RM address in configuration is blah:42631 + // + // This hack loops for a bit waiting for the port to change, and fails the test if it hasn't + // done so in a timely manner (defined to be 10 seconds). + val config = yarnCluster.getConfig() + val deadline = System.currentTimeMillis() + TimeUnit.SECONDS.toMillis(10) + while (config.get(YarnConfiguration.RM_ADDRESS).split(":")(1) == "0") { + if (System.currentTimeMillis() > deadline) { + throw new IllegalStateException("Timed out waiting for RM to come up.") + } + logDebug("RM address still not set in configuration, waiting...") + TimeUnit.MILLISECONDS.sleep(100) + } + + logInfo(s"RM address in configuration is ${config.get(YarnConfiguration.RM_ADDRESS)}") + + fakeSparkJar = File.createTempFile("sparkJar", null, tempDir) + hadoopConfDir = new File(tempDir, Client.LOCALIZED_CONF_DIR) + assert(hadoopConfDir.mkdir()) + File.createTempFile("token", ".txt", hadoopConfDir) + } + + override def afterAll() { + yarnCluster.stop() + System.clearProperty("SPARK_YARN_MODE") + super.afterAll() + } + + protected def runSpark( + clientMode: Boolean, + klass: String, + appArgs: Seq[String] = Nil, + sparkArgs: Seq[String] = Nil, + extraClassPath: Seq[String] = Nil, + extraJars: Seq[String] = Nil, + extraConf: Map[String, String] = Map()): Unit = { + val master = if (clientMode) "yarn-client" else "yarn-cluster" + val props = new Properties() + + props.setProperty("spark.yarn.jar", "local:" + fakeSparkJar.getAbsolutePath()) + + val childClasspath = logConfDir.getAbsolutePath() + + File.pathSeparator + + sys.props("java.class.path") + + File.pathSeparator + + extraClassPath.mkString(File.pathSeparator) + props.setProperty("spark.driver.extraClassPath", childClasspath) + props.setProperty("spark.executor.extraClassPath", childClasspath) + + // SPARK-4267: make sure java options are propagated correctly. 
+ props.setProperty("spark.driver.extraJavaOptions", "-Dfoo=\"one two three\"") + props.setProperty("spark.executor.extraJavaOptions", "-Dfoo=\"one two three\"") + + yarnCluster.getConfig().foreach { e => + props.setProperty("spark.hadoop." + e.getKey(), e.getValue()) + } + + sys.props.foreach { case (k, v) => + if (k.startsWith("spark.")) { + props.setProperty(k, v) + } + } + + extraConf.foreach { case (k, v) => props.setProperty(k, v) } + + val propsFile = File.createTempFile("spark", ".properties", tempDir) + val writer = new OutputStreamWriter(new FileOutputStream(propsFile), UTF_8) + props.store(writer, "Spark properties.") + writer.close() + + val extraJarArgs = if (!extraJars.isEmpty()) Seq("--jars", extraJars.mkString(",")) else Nil + val mainArgs = + if (klass.endsWith(".py")) { + Seq(klass) + } else { + Seq("--class", klass, fakeSparkJar.getAbsolutePath()) + } + val argv = + Seq( + new File(sys.props("spark.test.home"), "bin/spark-submit").getAbsolutePath(), + "--master", master, + "--num-executors", "1", + "--properties-file", propsFile.getAbsolutePath()) ++ + extraJarArgs ++ + sparkArgs ++ + mainArgs ++ + appArgs + + Utils.executeAndGetOutput(argv, + extraEnvironment = Map("YARN_CONF_DIR" -> hadoopConfDir.getAbsolutePath())) + } + + /** + * This is a workaround for an issue with yarn-cluster mode: the Client class will not provide + * any sort of error when the job process finishes successfully, but the job itself fails. So + * the tests enforce that something is written to a file after everything is ok to indicate + * that the job succeeded. + */ + protected def checkResult(result: File): Unit = { + checkResult(result, "success") + } + + protected def checkResult(result: File, expected: String): Unit = { + val resultString = Files.toString(result, UTF_8) + resultString should be (expected) + } + + protected def mainClassName(klass: Class[_]): String = { + klass.getName().stripSuffix("$") + } + +} diff --git a/yarn/src/test/scala/org/apache/spark/deploy/yarn/YarnClusterSuite.scala b/yarn/src/test/scala/org/apache/spark/deploy/yarn/YarnClusterSuite.scala index eb6e1fd370620..128350b648992 100644 --- a/yarn/src/test/scala/org/apache/spark/deploy/yarn/YarnClusterSuite.scala +++ b/yarn/src/test/scala/org/apache/spark/deploy/yarn/YarnClusterSuite.scala @@ -17,25 +17,20 @@ package org.apache.spark.deploy.yarn -import java.io.{File, FileOutputStream, OutputStreamWriter} +import java.io.File import java.net.URL -import java.util.Properties -import java.util.concurrent.TimeUnit -import scala.collection.JavaConversions._ import scala.collection.mutable +import scala.collection.JavaConversions._ import com.google.common.base.Charsets.UTF_8 -import com.google.common.io.ByteStreams -import com.google.common.io.Files +import com.google.common.io.{ByteStreams, Files} import org.apache.hadoop.yarn.conf.YarnConfiguration -import org.apache.hadoop.yarn.server.MiniYARNCluster -import org.scalatest.{BeforeAndAfterAll, Matchers} +import org.scalatest.Matchers import org.apache.spark._ +import org.apache.spark.scheduler.{SparkListener, SparkListenerApplicationStart, SparkListenerExecutorAdded} import org.apache.spark.scheduler.cluster.ExecutorInfo -import org.apache.spark.scheduler.{SparkListener, SparkListenerApplicationStart, - SparkListenerExecutorAdded} import org.apache.spark.util.Utils /** @@ -43,17 +38,9 @@ import org.apache.spark.util.Utils * applications, and require the Spark assembly to be built before they can be successfully * run. 
*/ -class YarnClusterSuite extends SparkFunSuite with BeforeAndAfterAll with Matchers with Logging { - - // log4j configuration for the YARN containers, so that their output is collected - // by YARN instead of trying to overwrite unit-tests.log. - private val LOG4J_CONF = """ - |log4j.rootCategory=DEBUG, console - |log4j.appender.console=org.apache.log4j.ConsoleAppender - |log4j.appender.console.target=System.err - |log4j.appender.console.layout=org.apache.log4j.PatternLayout - |log4j.appender.console.layout.ConversionPattern=%d{yy/MM/dd HH:mm:ss} %p %c{1}: %m%n - """.stripMargin +class YarnClusterSuite extends BaseYarnClusterSuite { + + override def yarnConfig: YarnConfiguration = new YarnConfiguration() private val TEST_PYFILE = """ |import mod1, mod2 @@ -82,65 +69,6 @@ class YarnClusterSuite extends SparkFunSuite with BeforeAndAfterAll with Matcher | return 42 """.stripMargin - private var yarnCluster: MiniYARNCluster = _ - private var tempDir: File = _ - private var fakeSparkJar: File = _ - private var hadoopConfDir: File = _ - private var logConfDir: File = _ - - override def beforeAll() { - super.beforeAll() - - tempDir = Utils.createTempDir() - logConfDir = new File(tempDir, "log4j") - logConfDir.mkdir() - System.setProperty("SPARK_YARN_MODE", "true") - - val logConfFile = new File(logConfDir, "log4j.properties") - Files.write(LOG4J_CONF, logConfFile, UTF_8) - - yarnCluster = new MiniYARNCluster(getClass().getName(), 1, 1, 1) - yarnCluster.init(new YarnConfiguration()) - yarnCluster.start() - - // There's a race in MiniYARNCluster in which start() may return before the RM has updated - // its address in the configuration. You can see this in the logs by noticing that when - // MiniYARNCluster prints the address, it still has port "0" assigned, although later the - // test works sometimes: - // - // INFO MiniYARNCluster: MiniYARN ResourceManager address: blah:0 - // - // That log message prints the contents of the RM_ADDRESS config variable. If you check it - // later on, it looks something like this: - // - // INFO YarnClusterSuite: RM address in configuration is blah:42631 - // - // This hack loops for a bit waiting for the port to change, and fails the test if it hasn't - // done so in a timely manner (defined to be 10 seconds). 
- val config = yarnCluster.getConfig() - val deadline = System.currentTimeMillis() + TimeUnit.SECONDS.toMillis(10) - while (config.get(YarnConfiguration.RM_ADDRESS).split(":")(1) == "0") { - if (System.currentTimeMillis() > deadline) { - throw new IllegalStateException("Timed out waiting for RM to come up.") - } - logDebug("RM address still not set in configuration, waiting...") - TimeUnit.MILLISECONDS.sleep(100) - } - - logInfo(s"RM address in configuration is ${config.get(YarnConfiguration.RM_ADDRESS)}") - - fakeSparkJar = File.createTempFile("sparkJar", null, tempDir) - hadoopConfDir = new File(tempDir, Client.LOCALIZED_CONF_DIR) - assert(hadoopConfDir.mkdir()) - File.createTempFile("token", ".txt", hadoopConfDir) - } - - override def afterAll() { - yarnCluster.stop() - System.clearProperty("SPARK_YARN_MODE") - super.afterAll() - } - test("run Spark in yarn-client mode") { testBasicYarnApp(true) } @@ -174,7 +102,7 @@ class YarnClusterSuite extends SparkFunSuite with BeforeAndAfterAll with Matcher } private def testBasicYarnApp(clientMode: Boolean): Unit = { - var result = File.createTempFile("result", null, tempDir) + val result = File.createTempFile("result", null, tempDir) runSpark(clientMode, mainClassName(YarnClusterDriver.getClass), appArgs = Seq(result.getAbsolutePath())) checkResult(result) @@ -224,89 +152,6 @@ class YarnClusterSuite extends SparkFunSuite with BeforeAndAfterAll with Matcher checkResult(executorResult, "OVERRIDDEN") } - private def runSpark( - clientMode: Boolean, - klass: String, - appArgs: Seq[String] = Nil, - sparkArgs: Seq[String] = Nil, - extraClassPath: Seq[String] = Nil, - extraJars: Seq[String] = Nil, - extraConf: Map[String, String] = Map()): Unit = { - val master = if (clientMode) "yarn-client" else "yarn-cluster" - val props = new Properties() - - props.setProperty("spark.yarn.jar", "local:" + fakeSparkJar.getAbsolutePath()) - - val childClasspath = logConfDir.getAbsolutePath() + - File.pathSeparator + - sys.props("java.class.path") + - File.pathSeparator + - extraClassPath.mkString(File.pathSeparator) - props.setProperty("spark.driver.extraClassPath", childClasspath) - props.setProperty("spark.executor.extraClassPath", childClasspath) - - // SPARK-4267: make sure java options are propagated correctly. - props.setProperty("spark.driver.extraJavaOptions", "-Dfoo=\"one two three\"") - props.setProperty("spark.executor.extraJavaOptions", "-Dfoo=\"one two three\"") - - yarnCluster.getConfig().foreach { e => - props.setProperty("spark.hadoop." 
+ e.getKey(), e.getValue()) - } - - sys.props.foreach { case (k, v) => - if (k.startsWith("spark.")) { - props.setProperty(k, v) - } - } - - extraConf.foreach { case (k, v) => props.setProperty(k, v) } - - val propsFile = File.createTempFile("spark", ".properties", tempDir) - val writer = new OutputStreamWriter(new FileOutputStream(propsFile), UTF_8) - props.store(writer, "Spark properties.") - writer.close() - - val extraJarArgs = if (!extraJars.isEmpty()) Seq("--jars", extraJars.mkString(",")) else Nil - val mainArgs = - if (klass.endsWith(".py")) { - Seq(klass) - } else { - Seq("--class", klass, fakeSparkJar.getAbsolutePath()) - } - val argv = - Seq( - new File(sys.props("spark.test.home"), "bin/spark-submit").getAbsolutePath(), - "--master", master, - "--num-executors", "1", - "--properties-file", propsFile.getAbsolutePath()) ++ - extraJarArgs ++ - sparkArgs ++ - mainArgs ++ - appArgs - - Utils.executeAndGetOutput(argv, - extraEnvironment = Map("YARN_CONF_DIR" -> hadoopConfDir.getAbsolutePath())) - } - - /** - * This is a workaround for an issue with yarn-cluster mode: the Client class will not provide - * any sort of error when the job process finishes successfully, but the job itself fails. So - * the tests enforce that something is written to a file after everything is ok to indicate - * that the job succeeded. - */ - private def checkResult(result: File): Unit = { - checkResult(result, "success") - } - - private def checkResult(result: File, expected: String): Unit = { - var resultString = Files.toString(result, UTF_8) - resultString should be (expected) - } - - private def mainClassName(klass: Class[_]): String = { - klass.getName().stripSuffix("$") - } - } private[spark] class SaveExecutorInfo extends SparkListener { diff --git a/yarn/src/test/scala/org/apache/spark/deploy/yarn/YarnShuffleIntegrationSuite.scala b/yarn/src/test/scala/org/apache/spark/deploy/yarn/YarnShuffleIntegrationSuite.scala new file mode 100644 index 0000000000000..5e8238822b90a --- /dev/null +++ b/yarn/src/test/scala/org/apache/spark/deploy/yarn/YarnShuffleIntegrationSuite.scala @@ -0,0 +1,109 @@ +/* +* Licensed to the Apache Software Foundation (ASF) under one or more +* contributor license agreements. See the NOTICE file distributed with +* this work for additional information regarding copyright ownership. +* The ASF licenses this file to You under the Apache License, Version 2.0 +* (the "License"); you may not use this file except in compliance with +* the License. You may obtain a copy of the License at +* +* http://www.apache.org/licenses/LICENSE-2.0 +* +* Unless required by applicable law or agreed to in writing, software +* distributed under the License is distributed on an "AS IS" BASIS, +* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +* See the License for the specific language governing permissions and +* limitations under the License. 
+*/ + +package org.apache.spark.deploy.yarn + +import java.io.File + +import com.google.common.base.Charsets.UTF_8 +import com.google.common.io.Files +import org.apache.commons.io.FileUtils +import org.apache.hadoop.yarn.conf.YarnConfiguration +import org.scalatest.Matchers + +import org.apache.spark._ +import org.apache.spark.network.shuffle.ShuffleTestAccessor +import org.apache.spark.network.yarn.{YarnShuffleService, YarnTestAccessor} + +/** + * Integration test for the external shuffle service with a yarn mini-cluster + */ +class YarnShuffleIntegrationSuite extends BaseYarnClusterSuite { + + override def yarnConfig: YarnConfiguration = { + val yarnConfig = new YarnConfiguration() + yarnConfig.set(YarnConfiguration.NM_AUX_SERVICES, "spark_shuffle") + yarnConfig.set(YarnConfiguration.NM_AUX_SERVICE_FMT.format("spark_shuffle"), + classOf[YarnShuffleService].getCanonicalName) + yarnConfig.set("spark.shuffle.service.port", "0") + yarnConfig + } + + test("external shuffle service") { + val shuffleServicePort = YarnTestAccessor.getShuffleServicePort + val shuffleService = YarnTestAccessor.getShuffleServiceInstance + + val registeredExecFile = YarnTestAccessor.getRegisteredExecutorFile(shuffleService) + + logInfo("Shuffle service port = " + shuffleServicePort) + val result = File.createTempFile("result", null, tempDir) + runSpark( + false, + mainClassName(YarnExternalShuffleDriver.getClass), + appArgs = Seq(result.getAbsolutePath(), registeredExecFile.getAbsolutePath), + extraConf = Map( + "spark.shuffle.service.enabled" -> "true", + "spark.shuffle.service.port" -> shuffleServicePort.toString + ) + ) + checkResult(result) + assert(YarnTestAccessor.getRegisteredExecutorFile(shuffleService).exists()) + } +} + +private object YarnExternalShuffleDriver extends Logging with Matchers { + + val WAIT_TIMEOUT_MILLIS = 10000 + + def main(args: Array[String]): Unit = { + if (args.length != 2) { + // scalastyle:off println + System.err.println( + s""" + |Invalid command line: ${args.mkString(" ")} + | + |Usage: ExternalShuffleDriver [result file] [registed exec file] + """.stripMargin) + // scalastyle:on println + System.exit(1) + } + + val sc = new SparkContext(new SparkConf() + .setAppName("External Shuffle Test")) + val conf = sc.getConf + val status = new File(args(0)) + val registeredExecFile = new File(args(1)) + logInfo("shuffle service executor file = " + registeredExecFile) + var result = "failure" + val execStateCopy = new File(registeredExecFile.getAbsolutePath + "_dup") + try { + val data = sc.parallelize(0 until 100, 10).map { x => (x % 10) -> x }.reduceByKey{ _ + _ }. 
+ collect().toSet + sc.listenerBus.waitUntilEmpty(WAIT_TIMEOUT_MILLIS) + data should be ((0 until 10).map{x => x -> (x * 10 + 450)}.toSet) + result = "success" + // only one process can open a leveldb file at a time, so we copy the files + FileUtils.copyDirectory(registeredExecFile, execStateCopy) + assert(!ShuffleTestAccessor.reloadRegisteredExecutors(execStateCopy).isEmpty) + } finally { + sc.stop() + FileUtils.deleteDirectory(execStateCopy) + Files.write(result, status, UTF_8) + } + } + +} diff --git a/yarn/src/test/scala/org/apache/spark/network/shuffle/ShuffleTestAccessor.scala b/yarn/src/test/scala/org/apache/spark/network/shuffle/ShuffleTestAccessor.scala new file mode 100644 index 0000000000000..aa46ec5100f0e --- /dev/null +++ b/yarn/src/test/scala/org/apache/spark/network/shuffle/ShuffleTestAccessor.scala @@ -0,0 +1,71 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.spark.network.shuffle + +import java.io.{IOException, File} +import java.util.concurrent.ConcurrentMap + +import com.google.common.annotations.VisibleForTesting +import org.apache.hadoop.yarn.api.records.ApplicationId +import org.fusesource.leveldbjni.JniDBFactory +import org.iq80.leveldb.{DB, Options} + +import org.apache.spark.network.shuffle.ExternalShuffleBlockResolver.AppExecId +import org.apache.spark.network.shuffle.protocol.ExecutorShuffleInfo + +/** + * just a cheat to get package-visible members in tests + */ +object ShuffleTestAccessor { + + def getBlockResolver(handler: ExternalShuffleBlockHandler): ExternalShuffleBlockResolver = { + handler.blockManager + } + + def getExecutorInfo( + appId: ApplicationId, + execId: String, + resolver: ExternalShuffleBlockResolver + ): Option[ExecutorShuffleInfo] = { + val id = new AppExecId(appId.toString, execId) + Option(resolver.executors.get(id)) + } + + def registeredExecutorFile(resolver: ExternalShuffleBlockResolver): File = { + resolver.registeredExecutorFile + } + + def shuffleServiceLevelDB(resolver: ExternalShuffleBlockResolver): DB = { + resolver.db + } + + def reloadRegisteredExecutors( + file: File): ConcurrentMap[ExternalShuffleBlockResolver.AppExecId, ExecutorShuffleInfo] = { + val options: Options = new Options + options.createIfMissing(true) + val factory = new JniDBFactory + val db = factory.open(file, options) + val result = ExternalShuffleBlockResolver.reloadRegisteredExecutors(db) + db.close() + result + } + + def reloadRegisteredExecutors( + db: DB): ConcurrentMap[ExternalShuffleBlockResolver.AppExecId, ExecutorShuffleInfo] = { + ExternalShuffleBlockResolver.reloadRegisteredExecutors(db) + } +} diff --git a/yarn/src/test/scala/org/apache/spark/network/yarn/YarnShuffleServiceSuite.scala b/yarn/src/test/scala/org/apache/spark/network/yarn/YarnShuffleServiceSuite.scala new file 
mode 100644 index 0000000000000..2f22cbdbeac37 --- /dev/null +++ b/yarn/src/test/scala/org/apache/spark/network/yarn/YarnShuffleServiceSuite.scala @@ -0,0 +1,233 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.spark.network.yarn + +import java.io.{DataOutputStream, File, FileOutputStream} + +import scala.annotation.tailrec + +import org.apache.commons.io.FileUtils +import org.apache.hadoop.yarn.api.records.ApplicationId +import org.apache.hadoop.yarn.conf.YarnConfiguration +import org.apache.hadoop.yarn.server.api.{ApplicationInitializationContext, ApplicationTerminationContext} +import org.scalatest.{BeforeAndAfterEach, Matchers} + +import org.apache.spark.SparkFunSuite +import org.apache.spark.network.shuffle.ShuffleTestAccessor +import org.apache.spark.network.shuffle.protocol.ExecutorShuffleInfo + +class YarnShuffleServiceSuite extends SparkFunSuite with Matchers with BeforeAndAfterEach { + private[yarn] var yarnConfig: YarnConfiguration = new YarnConfiguration + + override def beforeEach(): Unit = { + yarnConfig.set(YarnConfiguration.NM_AUX_SERVICES, "spark_shuffle") + yarnConfig.set(YarnConfiguration.NM_AUX_SERVICE_FMT.format("spark_shuffle"), + classOf[YarnShuffleService].getCanonicalName) + + yarnConfig.get("yarn.nodemanager.local-dirs").split(",").foreach { dir => + val d = new File(dir) + if (d.exists()) { + FileUtils.deleteDirectory(d) + } + FileUtils.forceMkdir(d) + logInfo(s"creating yarn.nodemanager.local-dirs: $d") + } + } + + var s1: YarnShuffleService = null + var s2: YarnShuffleService = null + var s3: YarnShuffleService = null + + override def afterEach(): Unit = { + if (s1 != null) { + s1.stop() + s1 = null + } + if (s2 != null) { + s2.stop() + s2 = null + } + if (s3 != null) { + s3.stop() + s3 = null + } + } + + test("executor state kept across NM restart") { + s1 = new YarnShuffleService + s1.init(yarnConfig) + val app1Id = ApplicationId.newInstance(0, 1) + val app1Data: ApplicationInitializationContext = + new ApplicationInitializationContext("user", app1Id, null) + s1.initializeApplication(app1Data) + val app2Id = ApplicationId.newInstance(0, 2) + val app2Data: ApplicationInitializationContext = + new ApplicationInitializationContext("user", app2Id, null) + s1.initializeApplication(app2Data) + + val execStateFile = s1.registeredExecutorFile + execStateFile should not be (null) + val shuffleInfo1 = new ExecutorShuffleInfo(Array("/foo", "/bar"), 3, "sort") + val shuffleInfo2 = new ExecutorShuffleInfo(Array("/bippy"), 5, "hash") + + val blockHandler = s1.blockHandler + val blockResolver = ShuffleTestAccessor.getBlockResolver(blockHandler) + ShuffleTestAccessor.registeredExecutorFile(blockResolver) should be (execStateFile) + + blockResolver.registerExecutor(app1Id.toString, "exec-1", shuffleInfo1) 
+ blockResolver.registerExecutor(app2Id.toString, "exec-2", shuffleInfo2) + ShuffleTestAccessor.getExecutorInfo(app1Id, "exec-1", blockResolver) should + be (Some(shuffleInfo1)) + ShuffleTestAccessor.getExecutorInfo(app2Id, "exec-2", blockResolver) should + be (Some(shuffleInfo2)) + + if (!execStateFile.exists()) { + @tailrec def findExistingParent(file: File): File = { + if (file == null) file + else if (file.exists()) file + else findExistingParent(file.getParentFile()) + } + val existingParent = findExistingParent(execStateFile) + assert(false, s"$execStateFile does not exist -- closest existing parent is $existingParent") + } + assert(execStateFile.exists(), s"$execStateFile did not exist") + + // now we pretend the shuffle service goes down, and comes back up + s1.stop() + s2 = new YarnShuffleService + s2.init(yarnConfig) + s2.registeredExecutorFile should be (execStateFile) + + val handler2 = s2.blockHandler + val resolver2 = ShuffleTestAccessor.getBlockResolver(handler2) + + // now we reinitialize only one of the apps, and expect yarn to tell us that app2 was stopped + // during the restart + s2.initializeApplication(app1Data) + s2.stopApplication(new ApplicationTerminationContext(app2Id)) + ShuffleTestAccessor.getExecutorInfo(app1Id, "exec-1", resolver2) should be (Some(shuffleInfo1)) + ShuffleTestAccessor.getExecutorInfo(app2Id, "exec-2", resolver2) should be (None) + + // Act like the NM restarts one more time + s2.stop() + s3 = new YarnShuffleService + s3.init(yarnConfig) + s3.registeredExecutorFile should be (execStateFile) + + val handler3 = s3.blockHandler + val resolver3 = ShuffleTestAccessor.getBlockResolver(handler3) + + // app1 is still running + s3.initializeApplication(app1Data) + ShuffleTestAccessor.getExecutorInfo(app1Id, "exec-1", resolver3) should be (Some(shuffleInfo1)) + ShuffleTestAccessor.getExecutorInfo(app2Id, "exec-2", resolver3) should be (None) + s3.stop() + } + + test("removed applications should not be in registered executor file") { + s1 = new YarnShuffleService + s1.init(yarnConfig) + val app1Id = ApplicationId.newInstance(0, 1) + val app1Data: ApplicationInitializationContext = + new ApplicationInitializationContext("user", app1Id, null) + s1.initializeApplication(app1Data) + val app2Id = ApplicationId.newInstance(0, 2) + val app2Data: ApplicationInitializationContext = + new ApplicationInitializationContext("user", app2Id, null) + s1.initializeApplication(app2Data) + + val execStateFile = s1.registeredExecutorFile + execStateFile should not be (null) + val shuffleInfo1 = new ExecutorShuffleInfo(Array("/foo", "/bar"), 3, "sort") + val shuffleInfo2 = new ExecutorShuffleInfo(Array("/bippy"), 5, "hash") + + val blockHandler = s1.blockHandler + val blockResolver = ShuffleTestAccessor.getBlockResolver(blockHandler) + ShuffleTestAccessor.registeredExecutorFile(blockResolver) should be (execStateFile) + + blockResolver.registerExecutor(app1Id.toString, "exec-1", shuffleInfo1) + blockResolver.registerExecutor(app2Id.toString, "exec-2", shuffleInfo2) + + val db = ShuffleTestAccessor.shuffleServiceLevelDB(blockResolver) + ShuffleTestAccessor.reloadRegisteredExecutors(db) should not be empty + + s1.stopApplication(new ApplicationTerminationContext(app1Id)) + ShuffleTestAccessor.reloadRegisteredExecutors(db) should not be empty + s1.stopApplication(new ApplicationTerminationContext(app2Id)) + ShuffleTestAccessor.reloadRegisteredExecutors(db) shouldBe empty + } + + test("shuffle service should be robust to corrupt registered executor file") { + s1 = new 
YarnShuffleService + s1.init(yarnConfig) + val app1Id = ApplicationId.newInstance(0, 1) + val app1Data: ApplicationInitializationContext = + new ApplicationInitializationContext("user", app1Id, null) + s1.initializeApplication(app1Data) + + val execStateFile = s1.registeredExecutorFile + val shuffleInfo1 = new ExecutorShuffleInfo(Array("/foo", "/bar"), 3, "sort") + + val blockHandler = s1.blockHandler + val blockResolver = ShuffleTestAccessor.getBlockResolver(blockHandler) + ShuffleTestAccessor.registeredExecutorFile(blockResolver) should be (execStateFile) + + blockResolver.registerExecutor(app1Id.toString, "exec-1", shuffleInfo1) + + // now we pretend the shuffle service goes down, and comes back up. But we'll also + // make a corrupt registeredExecutor File + s1.stop() + + execStateFile.listFiles().foreach{_.delete()} + + val out = new DataOutputStream(new FileOutputStream(execStateFile + "/CURRENT")) + out.writeInt(42) + out.close() + + s2 = new YarnShuffleService + s2.init(yarnConfig) + s2.registeredExecutorFile should be (execStateFile) + + val handler2 = s2.blockHandler + val resolver2 = ShuffleTestAccessor.getBlockResolver(handler2) + + // we re-initialize app1, but since the file was corrupt there is nothing we can do about it ... + s2.initializeApplication(app1Data) + // however, when we initialize a totally new app2, everything is still happy + val app2Id = ApplicationId.newInstance(0, 2) + val app2Data: ApplicationInitializationContext = + new ApplicationInitializationContext("user", app2Id, null) + s2.initializeApplication(app2Data) + val shuffleInfo2 = new ExecutorShuffleInfo(Array("/bippy"), 5, "hash") + resolver2.registerExecutor(app2Id.toString, "exec-2", shuffleInfo2) + ShuffleTestAccessor.getExecutorInfo(app2Id, "exec-2", resolver2) should be (Some(shuffleInfo2)) + s2.stop() + + // another stop & restart should be fine though (eg., we recover from previous corruption) + s3 = new YarnShuffleService + s3.init(yarnConfig) + s3.registeredExecutorFile should be (execStateFile) + val handler3 = s3.blockHandler + val resolver3 = ShuffleTestAccessor.getBlockResolver(handler3) + + s3.initializeApplication(app2Data) + ShuffleTestAccessor.getExecutorInfo(app2Id, "exec-2", resolver3) should be (Some(shuffleInfo2)) + s3.stop() + + } + +} diff --git a/yarn/src/test/scala/org/apache/spark/network/yarn/YarnTestAccessor.scala b/yarn/src/test/scala/org/apache/spark/network/yarn/YarnTestAccessor.scala new file mode 100644 index 0000000000000..db322cd18e150 --- /dev/null +++ b/yarn/src/test/scala/org/apache/spark/network/yarn/YarnTestAccessor.scala @@ -0,0 +1,37 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +package org.apache.spark.network.yarn + +import java.io.File + +/** + * just a cheat to get package-visible members in tests + */ +object YarnTestAccessor { + def getShuffleServicePort: Int = { + YarnShuffleService.boundPort + } + + def getShuffleServiceInstance: YarnShuffleService = { + YarnShuffleService.instance + } + + def getRegisteredExecutorFile(service: YarnShuffleService): File = { + service.registeredExecutorFile + } + +} From 3c462f5d87a9654c5a68fd658a40f5062029fd9a Mon Sep 17 00:00:00 2001 From: Daoyuan Wang Date: Fri, 21 Aug 2015 12:21:51 -0700 Subject: [PATCH 033/802] [SPARK-10130] [SQL] type coercion for IF should have children resolved first Type coercion for IF should have children resolved first, or we could meet unresolved exception. Author: Daoyuan Wang Closes #8331 from adrian-wang/spark10130. --- .../spark/sql/catalyst/analysis/HiveTypeCoercion.scala | 1 + .../test/scala/org/apache/spark/sql/SQLQuerySuite.scala | 7 +++++++ 2 files changed, 8 insertions(+) diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/HiveTypeCoercion.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/HiveTypeCoercion.scala index f2f2ba2f96552..2cb067f4aac91 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/HiveTypeCoercion.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/HiveTypeCoercion.scala @@ -639,6 +639,7 @@ object HiveTypeCoercion { */ object IfCoercion extends Rule[LogicalPlan] { def apply(plan: LogicalPlan): LogicalPlan = plan resolveExpressions { + case e if !e.childrenResolved => e // Find tightest common type for If, if the true value and false value have different types. case i @ If(pred, left, right) if left.dataType != right.dataType => findTightestCommonTypeToString(left.dataType, right.dataType).map { widestType => diff --git a/sql/core/src/test/scala/org/apache/spark/sql/SQLQuerySuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/SQLQuerySuite.scala index da50aec17c89e..dcb4e83710982 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/SQLQuerySuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/SQLQuerySuite.scala @@ -1679,4 +1679,11 @@ class SQLQuerySuite extends QueryTest with SharedSQLContext { checkAnswer(sqlContext.table("`db.t`"), df) } } + + test("SPARK-10130 type coercion for IF should have children resolved first") { + val df = Seq((1, 1), (-1, 1)).toDF("key", "value") + df.registerTempTable("src") + checkAnswer( + sql("SELECT IF(a > 0, a, 0) FROM (SELECT key a FROM src) temp"), Seq(Row(1), Row(0))) + } } From d89cc38b33815e7b99fb3389b5038a543527065d Mon Sep 17 00:00:00 2001 From: jerryshao Date: Fri, 21 Aug 2015 13:10:11 -0700 Subject: [PATCH 034/802] [SPARK-10122] [PYSPARK] [STREAMING] Fix getOffsetRanges bug in PySpark-Streaming transform function Details of the bug and explanations can be seen in [SPARK-10122](https://issues.apache.org/jira/browse/SPARK-10122). tdas , please help to review. 
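To make the SPARK-10130 change above concrete: the IfCoercion rule now returns any expression whose children are still unresolved untouched, so it never calls dataType on an unresolved child. Below is a minimal, self-contained Scala sketch of that guard pattern; Expr, Literal, UnresolvedAttr, Cast, If and ifCoercion are simplified stand-ins for illustration, not the real Catalyst API.

object IfCoercionSketch {

  sealed trait Expr {
    def resolved: Boolean
    def children: Seq[Expr]
    def childrenResolved: Boolean = children.forall(_.resolved)
    def dataType: String
  }

  case class Literal(value: Any, dataType: String) extends Expr {
    val resolved = true
    val children = Nil
  }

  case class UnresolvedAttr(name: String) extends Expr {
    val resolved = false
    val children = Nil
    def dataType: String = sys.error(s"attribute $name is not resolved yet")
  }

  case class Cast(child: Expr, dataType: String) extends Expr {
    val resolved = child.resolved
    val children = Seq(child)
  }

  case class If(pred: Expr, left: Expr, right: Expr) extends Expr {
    val children = Seq(pred, left, right)
    val resolved = childrenResolved
    def dataType: String = left.dataType
  }

  // Toy version of the coercion rule: the first case is the SPARK-10130 fix --
  // bail out until the analyzer has resolved all children; only then compare
  // branch types and insert casts (here the "widest type" is simply "string").
  def ifCoercion(e: Expr): Expr = e match {
    case e if !e.childrenResolved => e
    case If(p, l, r) if l.dataType != r.dataType =>
      If(p, Cast(l, "string"), Cast(r, "string"))
    case other => other
  }

  def main(args: Array[String]): Unit = {
    val expr = If(Literal(true, "boolean"), UnresolvedAttr("a"), Literal(0, "int"))
    // With the guard the unresolved If comes back unchanged; without it,
    // l.dataType would throw before the analyzer had resolved attribute "a".
    println(ifCoercion(expr))
  }
}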
Author: jerryshao Closes #8347 from jerryshao/SPARK-10122 and squashes the following commits: 4039b16 [jerryshao] Fix getOffsetRanges in transform() bug --- python/pyspark/streaming/dstream.py | 5 ++++- python/pyspark/streaming/tests.py | 4 +++- 2 files changed, 7 insertions(+), 2 deletions(-) diff --git a/python/pyspark/streaming/dstream.py b/python/pyspark/streaming/dstream.py index 8dcb9645cdc6b..698336cfce18d 100644 --- a/python/pyspark/streaming/dstream.py +++ b/python/pyspark/streaming/dstream.py @@ -610,7 +610,10 @@ def __init__(self, prev, func): self.is_checkpointed = False self._jdstream_val = None - if (isinstance(prev, TransformedDStream) and + # Using type() to avoid folding the functions and compacting the DStreams which is not + # not strictly a object of TransformedDStream. + # Changed here is to avoid bug in KafkaTransformedDStream when calling offsetRanges(). + if (type(prev) is TransformedDStream and not prev.is_cached and not prev.is_checkpointed): prev_func = prev.func self.func = lambda t, rdd: func(t, prev_func(t, rdd)) diff --git a/python/pyspark/streaming/tests.py b/python/pyspark/streaming/tests.py index 6108c845c1efe..214d5be439003 100644 --- a/python/pyspark/streaming/tests.py +++ b/python/pyspark/streaming/tests.py @@ -850,7 +850,9 @@ def transformWithOffsetRanges(rdd): offsetRanges.append(o) return rdd - stream.transform(transformWithOffsetRanges).foreachRDD(lambda rdd: rdd.count()) + # Test whether it is ok mixing KafkaTransformedDStream and TransformedDStream together, + # only the TransformedDstreams can be folded together. + stream.transform(transformWithOffsetRanges).map(lambda kv: kv[1]).count().pprint() self.ssc.start() self.wait_for(offsetRanges, 1) From f5b028ed2f1ad6de43c8b50ebf480e1b6c047035 Mon Sep 17 00:00:00 2001 From: MechCoder Date: Fri, 21 Aug 2015 14:19:24 -0700 Subject: [PATCH 035/802] [SPARK-9864] [DOC] [MLlib] [SQL] Replace since in scaladoc to Since annotation Author: MechCoder Closes #8352 from MechCoder/since. 
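The SPARK-9864 diff that follows is almost entirely mechanical: each "@since x.y.z" scaladoc tag is removed and the member is annotated with @Since("x.y.z") from org.apache.spark.annotation instead, so the doc comment keeps only the description while the version is carried as structured metadata. A small before/after sketch of the pattern, using a hypothetical ExampleClusterer.setK rather than any file from the diff; the package name is made up and placed inside org.apache.spark only so the annotation is visible exactly as in the hunks below.

package org.apache.spark.mllib.example

import org.apache.spark.annotation.Since  // same import the hunks below add

// Hypothetical class, for illustration only; the real edits touch the files
// listed in the diffstat that follows.
class ExampleClusterer {

  private var k: Int = 2

  // Before: the version lived only inside the scaladoc text.
  //   /**
  //    * Set the number of clusters to create. Default: 2.
  //    * @since 0.8.0
  //    */
  //   def setK(k: Int): this.type = { this.k = k; this }

  // After: the tag is dropped and the version becomes an annotation.
  /**
   * Set the number of clusters to create. Default: 2.
   */
  @Since("0.8.0")
  def setK(k: Int): this.type = {
    this.k = k
    this
  }
}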
--- .../classification/ClassificationModel.scala | 8 +- .../classification/LogisticRegression.scala | 30 ++--- .../mllib/classification/NaiveBayes.scala | 7 +- .../spark/mllib/classification/SVM.scala | 28 ++--- .../mllib/clustering/GaussianMixture.scala | 28 ++--- .../clustering/GaussianMixtureModel.scala | 28 ++--- .../spark/mllib/clustering/KMeans.scala | 50 ++++----- .../spark/mllib/clustering/KMeansModel.scala | 27 ++--- .../apache/spark/mllib/clustering/LDA.scala | 56 ++++----- .../spark/mllib/clustering/LDAModel.scala | 69 +++++------- .../spark/mllib/clustering/LDAOptimizer.scala | 24 ++-- .../clustering/PowerIterationClustering.scala | 38 +++---- .../mllib/clustering/StreamingKMeans.scala | 35 +++--- .../BinaryClassificationMetrics.scala | 26 ++--- .../mllib/evaluation/MulticlassMetrics.scala | 20 ++-- .../mllib/evaluation/MultilabelMetrics.scala | 9 +- .../mllib/evaluation/RankingMetrics.scala | 10 +- .../mllib/evaluation/RegressionMetrics.scala | 14 +-- .../spark/mllib/fpm/AssociationRules.scala | 20 ++-- .../org/apache/spark/mllib/fpm/FPGrowth.scala | 22 ++-- .../apache/spark/mllib/linalg/Matrices.scala | 106 ++++++++---------- .../linalg/SingularValueDecomposition.scala | 4 +- .../apache/spark/mllib/linalg/Vectors.scala | 90 +++++---------- .../linalg/distributed/BlockMatrix.scala | 88 ++++++--------- .../linalg/distributed/CoordinateMatrix.scala | 40 +++---- .../distributed/DistributedMatrix.scala | 4 +- .../linalg/distributed/IndexedRowMatrix.scala | 38 +++---- .../mllib/linalg/distributed/RowMatrix.scala | 39 +++---- .../spark/mllib/recommendation/ALS.scala | 22 ++-- .../MatrixFactorizationModel.scala | 28 +++-- .../GeneralizedLinearAlgorithm.scala | 24 ++-- .../mllib/regression/IsotonicRegression.scala | 22 ++-- .../spark/mllib/regression/LabeledPoint.scala | 7 +- .../apache/spark/mllib/regression/Lasso.scala | 25 ++--- .../mllib/regression/LinearRegression.scala | 25 ++--- .../mllib/regression/RegressionModel.scala | 12 +- .../mllib/regression/RidgeRegression.scala | 25 ++--- .../regression/StreamingLinearAlgorithm.scala | 18 +-- .../spark/mllib/stat/KernelDensity.scala | 12 +- .../stat/MultivariateOnlineSummarizer.scala | 24 ++-- .../stat/MultivariateStatisticalSummary.scala | 19 ++-- .../apache/spark/mllib/stat/Statistics.scala | 30 ++--- .../distribution/MultivariateGaussian.scala | 8 +- .../spark/mllib/tree/DecisionTree.scala | 28 +++-- .../mllib/tree/GradientBoostedTrees.scala | 20 ++-- .../spark/mllib/tree/RandomForest.scala | 20 ++-- .../spark/mllib/tree/configuration/Algo.scala | 4 +- .../tree/configuration/BoostingStrategy.scala | 12 +- .../tree/configuration/FeatureType.scala | 4 +- .../tree/configuration/QuantileStrategy.scala | 4 +- .../mllib/tree/configuration/Strategy.scala | 24 ++-- .../spark/mllib/tree/impurity/Entropy.scala | 10 +- .../spark/mllib/tree/impurity/Gini.scala | 10 +- .../spark/mllib/tree/impurity/Impurity.scala | 8 +- .../spark/mllib/tree/impurity/Variance.scala | 10 +- .../spark/mllib/tree/loss/AbsoluteError.scala | 6 +- .../spark/mllib/tree/loss/LogLoss.scala | 6 +- .../apache/spark/mllib/tree/loss/Loss.scala | 8 +- .../apache/spark/mllib/tree/loss/Losses.scala | 10 +- .../spark/mllib/tree/loss/SquaredError.scala | 6 +- .../mllib/tree/model/DecisionTreeModel.scala | 22 ++-- .../tree/model/InformationGainStats.scala | 4 +- .../apache/spark/mllib/tree/model/Node.scala | 8 +- .../spark/mllib/tree/model/Predict.scala | 4 +- .../apache/spark/mllib/tree/model/Split.scala | 4 +- .../mllib/tree/model/treeEnsembleModels.scala | 26 ++--- 
.../org/apache/spark/mllib/tree/package.scala | 1 - .../org/apache/spark/mllib/util/MLUtils.scala | 36 +++--- 68 files changed, 692 insertions(+), 862 deletions(-) diff --git a/mllib/src/main/scala/org/apache/spark/mllib/classification/ClassificationModel.scala b/mllib/src/main/scala/org/apache/spark/mllib/classification/ClassificationModel.scala index ba73024e3c04d..a29b425a71fd6 100644 --- a/mllib/src/main/scala/org/apache/spark/mllib/classification/ClassificationModel.scala +++ b/mllib/src/main/scala/org/apache/spark/mllib/classification/ClassificationModel.scala @@ -19,7 +19,7 @@ package org.apache.spark.mllib.classification import org.json4s.{DefaultFormats, JValue} -import org.apache.spark.annotation.Experimental +import org.apache.spark.annotation.{Experimental, Since} import org.apache.spark.api.java.JavaRDD import org.apache.spark.mllib.linalg.Vector import org.apache.spark.rdd.RDD @@ -36,8 +36,8 @@ trait ClassificationModel extends Serializable { * * @param testData RDD representing data points to be predicted * @return an RDD[Double] where each entry contains the corresponding prediction - * @since 0.8.0 */ + @Since("0.8.0") def predict(testData: RDD[Vector]): RDD[Double] /** @@ -45,16 +45,16 @@ trait ClassificationModel extends Serializable { * * @param testData array representing a single data point * @return predicted category from the trained model - * @since 0.8.0 */ + @Since("0.8.0") def predict(testData: Vector): Double /** * Predict values for examples stored in a JavaRDD. * @param testData JavaRDD representing data points to be predicted * @return a JavaRDD[java.lang.Double] where each entry contains the corresponding prediction - * @since 0.8.0 */ + @Since("0.8.0") def predict(testData: JavaRDD[Vector]): JavaRDD[java.lang.Double] = predict(testData.rdd).toJavaRDD().asInstanceOf[JavaRDD[java.lang.Double]] } diff --git a/mllib/src/main/scala/org/apache/spark/mllib/classification/LogisticRegression.scala b/mllib/src/main/scala/org/apache/spark/mllib/classification/LogisticRegression.scala index 268642ac6a2f6..e03e662227d14 100644 --- a/mllib/src/main/scala/org/apache/spark/mllib/classification/LogisticRegression.scala +++ b/mllib/src/main/scala/org/apache/spark/mllib/classification/LogisticRegression.scala @@ -18,7 +18,7 @@ package org.apache.spark.mllib.classification import org.apache.spark.SparkContext -import org.apache.spark.annotation.Experimental +import org.apache.spark.annotation.{Experimental, Since} import org.apache.spark.mllib.classification.impl.GLMClassificationModel import org.apache.spark.mllib.linalg.BLAS.dot import org.apache.spark.mllib.linalg.{DenseVector, Vector} @@ -85,8 +85,8 @@ class LogisticRegressionModel ( * in Binary Logistic Regression. An example with prediction score greater than or equal to * this threshold is identified as an positive, and negative otherwise. The default value is 0.5. * It is only used for binary classification. - * @since 1.0.0 */ + @Since("1.0.0") @Experimental def setThreshold(threshold: Double): this.type = { this.threshold = Some(threshold) @@ -97,8 +97,8 @@ class LogisticRegressionModel ( * :: Experimental :: * Returns the threshold (if any) used for converting raw prediction scores into 0/1 predictions. * It is only used for binary classification. - * @since 1.3.0 */ + @Since("1.3.0") @Experimental def getThreshold: Option[Double] = threshold @@ -106,8 +106,8 @@ class LogisticRegressionModel ( * :: Experimental :: * Clears the threshold so that `predict` will output raw prediction scores. 
* It is only used for binary classification. - * @since 1.0.0 */ + @Since("1.0.0") @Experimental def clearThreshold(): this.type = { threshold = None @@ -158,9 +158,7 @@ class LogisticRegressionModel ( } } - /** - * @since 1.3.0 - */ + @Since("1.3.0") override def save(sc: SparkContext, path: String): Unit = { GLMClassificationModel.SaveLoadV1_0.save(sc, path, this.getClass.getName, numFeatures, numClasses, weights, intercept, threshold) @@ -168,9 +166,7 @@ class LogisticRegressionModel ( override protected def formatVersion: String = "1.0" - /** - * @since 1.4.0 - */ + @Since("1.4.0") override def toString: String = { s"${super.toString}, numClasses = ${numClasses}, threshold = ${threshold.getOrElse("None")}" } @@ -178,9 +174,7 @@ class LogisticRegressionModel ( object LogisticRegressionModel extends Loader[LogisticRegressionModel] { - /** - * @since 1.3.0 - */ + @Since("1.3.0") override def load(sc: SparkContext, path: String): LogisticRegressionModel = { val (loadedClassName, version, metadata) = Loader.loadMetadata(sc, path) // Hard-code class name string in case it changes in the future @@ -261,8 +255,8 @@ object LogisticRegressionWithSGD { * @param miniBatchFraction Fraction of data to be used per iteration. * @param initialWeights Initial set of weights to be used. Array should be equal in size to * the number of features in the data. - * @since 1.0.0 */ + @Since("1.0.0") def train( input: RDD[LabeledPoint], numIterations: Int, @@ -284,8 +278,8 @@ object LogisticRegressionWithSGD { * @param stepSize Step size to be used for each iteration of gradient descent. * @param miniBatchFraction Fraction of data to be used per iteration. - * @since 1.0.0 */ + @Since("1.0.0") def train( input: RDD[LabeledPoint], numIterations: Int, @@ -306,8 +300,8 @@ object LogisticRegressionWithSGD { * @param numIterations Number of iterations of gradient descent to run. * @return a LogisticRegressionModel which has the weights and offset from training. - * @since 1.0.0 */ + @Since("1.0.0") def train( input: RDD[LabeledPoint], numIterations: Int, @@ -324,8 +318,8 @@ object LogisticRegressionWithSGD { * @param input RDD of (label, array of features) pairs. * @param numIterations Number of iterations of gradient descent to run. * @return a LogisticRegressionModel which has the weights and offset from training. - * @since 1.0.0 */ + @Since("1.0.0") def train( input: RDD[LabeledPoint], numIterations: Int): LogisticRegressionModel = { @@ -361,8 +355,8 @@ class LogisticRegressionWithLBFGS * Set the number of possible outcomes for k classes classification problem in * Multinomial Logistic Regression. * By default, it is binary logistic regression so k will be set to 2. 
- * @since 1.3.0 */ + @Since("1.3.0") @Experimental def setNumClasses(numClasses: Int): this.type = { require(numClasses > 1) diff --git a/mllib/src/main/scala/org/apache/spark/mllib/classification/NaiveBayes.scala b/mllib/src/main/scala/org/apache/spark/mllib/classification/NaiveBayes.scala index 2df91c09421e9..dab369207cc9a 100644 --- a/mllib/src/main/scala/org/apache/spark/mllib/classification/NaiveBayes.scala +++ b/mllib/src/main/scala/org/apache/spark/mllib/classification/NaiveBayes.scala @@ -25,6 +25,7 @@ import org.json4s.JsonDSL._ import org.json4s.jackson.JsonMethods._ import org.apache.spark.{Logging, SparkContext, SparkException} +import org.apache.spark.annotation.Since import org.apache.spark.mllib.linalg.{BLAS, DenseMatrix, DenseVector, SparseVector, Vector} import org.apache.spark.mllib.regression.LabeledPoint import org.apache.spark.mllib.util.{Loader, Saveable} @@ -444,8 +445,8 @@ object NaiveBayes { * * @param input RDD of `(label, array of features)` pairs. Every vector should be a frequency * vector or a count vector. - * @since 0.9.0 */ + @Since("0.9.0") def train(input: RDD[LabeledPoint]): NaiveBayesModel = { new NaiveBayes().run(input) } @@ -460,8 +461,8 @@ object NaiveBayes { * @param input RDD of `(label, array of features)` pairs. Every vector should be a frequency * vector or a count vector. * @param lambda The smoothing parameter - * @since 0.9.0 */ + @Since("0.9.0") def train(input: RDD[LabeledPoint], lambda: Double): NaiveBayesModel = { new NaiveBayes(lambda, Multinomial).run(input) } @@ -483,8 +484,8 @@ object NaiveBayes { * * @param modelType The type of NB model to fit from the enumeration NaiveBayesModels, can be * multinomial or bernoulli - * @since 0.9.0 */ + @Since("0.9.0") def train(input: RDD[LabeledPoint], lambda: Double, modelType: String): NaiveBayesModel = { require(supportedModelTypes.contains(modelType), s"NaiveBayes was created with an unknown modelType: $modelType.") diff --git a/mllib/src/main/scala/org/apache/spark/mllib/classification/SVM.scala b/mllib/src/main/scala/org/apache/spark/mllib/classification/SVM.scala index 5b54feeb10467..5f87269863572 100644 --- a/mllib/src/main/scala/org/apache/spark/mllib/classification/SVM.scala +++ b/mllib/src/main/scala/org/apache/spark/mllib/classification/SVM.scala @@ -18,7 +18,7 @@ package org.apache.spark.mllib.classification import org.apache.spark.SparkContext -import org.apache.spark.annotation.Experimental +import org.apache.spark.annotation.{Experimental, Since} import org.apache.spark.mllib.classification.impl.GLMClassificationModel import org.apache.spark.mllib.linalg.Vector import org.apache.spark.mllib.optimization._ @@ -46,8 +46,8 @@ class SVMModel ( * Sets the threshold that separates positive predictions from negative predictions. An example * with prediction score greater than or equal to this threshold is identified as an positive, * and negative otherwise. The default value is 0.0. - * @since 1.3.0 */ + @Since("1.3.0") @Experimental def setThreshold(threshold: Double): this.type = { this.threshold = Some(threshold) @@ -57,16 +57,16 @@ class SVMModel ( /** * :: Experimental :: * Returns the threshold (if any) used for converting raw prediction scores into 0/1 predictions. - * @since 1.3.0 */ + @Since("1.3.0") @Experimental def getThreshold: Option[Double] = threshold /** * :: Experimental :: * Clears the threshold so that `predict` will output raw prediction scores. 
- * @since 1.0.0 */ + @Since("1.0.0") @Experimental def clearThreshold(): this.type = { threshold = None @@ -84,9 +84,7 @@ class SVMModel ( } } - /** - * @since 1.3.0 - */ + @Since("1.3.0") override def save(sc: SparkContext, path: String): Unit = { GLMClassificationModel.SaveLoadV1_0.save(sc, path, this.getClass.getName, numFeatures = weights.size, numClasses = 2, weights, intercept, threshold) @@ -94,9 +92,7 @@ class SVMModel ( override protected def formatVersion: String = "1.0" - /** - * @since 1.4.0 - */ + @Since("1.4.0") override def toString: String = { s"${super.toString}, numClasses = 2, threshold = ${threshold.getOrElse("None")}" } @@ -104,9 +100,7 @@ class SVMModel ( object SVMModel extends Loader[SVMModel] { - /** - * @since 1.3.0 - */ + @Since("1.3.0") override def load(sc: SparkContext, path: String): SVMModel = { val (loadedClassName, version, metadata) = Loader.loadMetadata(sc, path) // Hard-code class name string in case it changes in the future @@ -185,8 +179,8 @@ object SVMWithSGD { * @param miniBatchFraction Fraction of data to be used per iteration. * @param initialWeights Initial set of weights to be used. Array should be equal in size to * the number of features in the data. - * @since 0.8.0 */ + @Since("0.8.0") def train( input: RDD[LabeledPoint], numIterations: Int, @@ -209,8 +203,8 @@ object SVMWithSGD { * @param stepSize Step size to be used for each iteration of gradient descent. * @param regParam Regularization parameter. * @param miniBatchFraction Fraction of data to be used per iteration. - * @since 0.8.0 */ + @Since("0.8.0") def train( input: RDD[LabeledPoint], numIterations: Int, @@ -231,8 +225,8 @@ object SVMWithSGD { * @param regParam Regularization parameter. * @param numIterations Number of iterations of gradient descent to run. * @return a SVMModel which has the weights and offset from training. - * @since 0.8.0 */ + @Since("0.8.0") def train( input: RDD[LabeledPoint], numIterations: Int, @@ -250,8 +244,8 @@ object SVMWithSGD { * @param input RDD of (label, array of features) pairs. * @param numIterations Number of iterations of gradient descent to run. * @return a SVMModel which has the weights and offset from training. - * @since 0.8.0 */ + @Since("0.8.0") def train(input: RDD[LabeledPoint], numIterations: Int): SVMModel = { train(input, numIterations, 1.0, 0.01, 1.0) } diff --git a/mllib/src/main/scala/org/apache/spark/mllib/clustering/GaussianMixture.scala b/mllib/src/main/scala/org/apache/spark/mllib/clustering/GaussianMixture.scala index bc27b1fe7390b..fcc9dfecac54f 100644 --- a/mllib/src/main/scala/org/apache/spark/mllib/clustering/GaussianMixture.scala +++ b/mllib/src/main/scala/org/apache/spark/mllib/clustering/GaussianMixture.scala @@ -21,7 +21,7 @@ import scala.collection.mutable.IndexedSeq import breeze.linalg.{diag, DenseMatrix => BreezeMatrix, DenseVector => BDV, Vector => BV} -import org.apache.spark.annotation.Experimental +import org.apache.spark.annotation.{Experimental, Since} import org.apache.spark.api.java.JavaRDD import org.apache.spark.mllib.linalg.{BLAS, DenseMatrix, Matrices, Vector, Vectors} import org.apache.spark.mllib.stat.distribution.MultivariateGaussian @@ -62,8 +62,8 @@ class GaussianMixture private ( /** * Constructs a default instance. The default parameters are {k: 2, convergenceTol: 0.01, * maxIterations: 100, seed: random}. 
- * @since 1.3.0 */ + @Since("1.3.0") def this() = this(2, 0.01, 100, Utils.random.nextLong()) // number of samples per cluster to use when initializing Gaussians @@ -77,8 +77,8 @@ class GaussianMixture private ( * Set the initial GMM starting point, bypassing the random initialization. * You must call setK() prior to calling this method, and the condition * (model.k == this.k) must be met; failure will result in an IllegalArgumentException - * @since 1.3.0 */ + @Since("1.3.0") def setInitialModel(model: GaussianMixtureModel): this.type = { if (model.k == k) { initialModel = Some(model) @@ -90,14 +90,14 @@ class GaussianMixture private ( /** * Return the user supplied initial GMM, if supplied - * @since 1.3.0 */ + @Since("1.3.0") def getInitialModel: Option[GaussianMixtureModel] = initialModel /** * Set the number of Gaussians in the mixture model. Default: 2 - * @since 1.3.0 */ + @Since("1.3.0") def setK(k: Int): this.type = { this.k = k this @@ -105,14 +105,14 @@ class GaussianMixture private ( /** * Return the number of Gaussians in the mixture model - * @since 1.3.0 */ + @Since("1.3.0") def getK: Int = k /** * Set the maximum number of iterations to run. Default: 100 - * @since 1.3.0 */ + @Since("1.3.0") def setMaxIterations(maxIterations: Int): this.type = { this.maxIterations = maxIterations this @@ -120,15 +120,15 @@ class GaussianMixture private ( /** * Return the maximum number of iterations to run - * @since 1.3.0 */ + @Since("1.3.0") def getMaxIterations: Int = maxIterations /** * Set the largest change in log-likelihood at which convergence is * considered to have occurred. - * @since 1.3.0 */ + @Since("1.3.0") def setConvergenceTol(convergenceTol: Double): this.type = { this.convergenceTol = convergenceTol this @@ -137,14 +137,14 @@ class GaussianMixture private ( /** * Return the largest change in log-likelihood at which convergence is * considered to have occurred. 
- * @since 1.3.0 */ + @Since("1.3.0") def getConvergenceTol: Double = convergenceTol /** * Set the random seed - * @since 1.3.0 */ + @Since("1.3.0") def setSeed(seed: Long): this.type = { this.seed = seed this @@ -152,14 +152,14 @@ class GaussianMixture private ( /** * Return the random seed - * @since 1.3.0 */ + @Since("1.3.0") def getSeed: Long = seed /** * Perform expectation maximization - * @since 1.3.0 */ + @Since("1.3.0") def run(data: RDD[Vector]): GaussianMixtureModel = { val sc = data.sparkContext @@ -235,8 +235,8 @@ class GaussianMixture private ( /** * Java-friendly version of [[run()]] - * @since 1.3.0 */ + @Since("1.3.0") def run(data: JavaRDD[Vector]): GaussianMixtureModel = run(data.rdd) private def updateWeightsAndGaussians( diff --git a/mllib/src/main/scala/org/apache/spark/mllib/clustering/GaussianMixtureModel.scala b/mllib/src/main/scala/org/apache/spark/mllib/clustering/GaussianMixtureModel.scala index 2fa0473737aae..1a10a8b624218 100644 --- a/mllib/src/main/scala/org/apache/spark/mllib/clustering/GaussianMixtureModel.scala +++ b/mllib/src/main/scala/org/apache/spark/mllib/clustering/GaussianMixtureModel.scala @@ -24,7 +24,7 @@ import org.json4s.JsonDSL._ import org.json4s.jackson.JsonMethods._ import org.apache.spark.SparkContext -import org.apache.spark.annotation.Experimental +import org.apache.spark.annotation.{Experimental, Since} import org.apache.spark.api.java.JavaRDD import org.apache.spark.mllib.linalg.{Vector, Matrices, Matrix} import org.apache.spark.mllib.stat.distribution.MultivariateGaussian @@ -43,8 +43,8 @@ import org.apache.spark.sql.{SQLContext, Row} * the weight for Gaussian i, and weights.sum == 1 * @param gaussians Array of MultivariateGaussian where gaussians(i) represents * the Multivariate Gaussian (Normal) Distribution for Gaussian i - * @since 1.3.0 */ +@Since("1.3.0") @Experimental class GaussianMixtureModel( val weights: Array[Double], @@ -54,23 +54,21 @@ class GaussianMixtureModel( override protected def formatVersion = "1.0" - /** - * @since 1.4.0 - */ + @Since("1.4.0") override def save(sc: SparkContext, path: String): Unit = { GaussianMixtureModel.SaveLoadV1_0.save(sc, path, weights, gaussians) } /** * Number of gaussians in mixture - * @since 1.3.0 */ + @Since("1.3.0") def k: Int = weights.length /** * Maps given points to their cluster indices. - * @since 1.3.0 */ + @Since("1.3.0") def predict(points: RDD[Vector]): RDD[Int] = { val responsibilityMatrix = predictSoft(points) responsibilityMatrix.map(r => r.indexOf(r.max)) @@ -78,8 +76,8 @@ class GaussianMixtureModel( /** * Maps given point to its cluster index. - * @since 1.5.0 */ + @Since("1.5.0") def predict(point: Vector): Int = { val r = computeSoftAssignments(point.toBreeze.toDenseVector, gaussians, weights, k) r.indexOf(r.max) @@ -87,16 +85,16 @@ class GaussianMixtureModel( /** * Java-friendly version of [[predict()]] - * @since 1.4.0 */ + @Since("1.4.0") def predict(points: JavaRDD[Vector]): JavaRDD[java.lang.Integer] = predict(points.rdd).toJavaRDD().asInstanceOf[JavaRDD[java.lang.Integer]] /** * Given the input vectors, return the membership value of each vector * to all mixture components. - * @since 1.3.0 */ + @Since("1.3.0") def predictSoft(points: RDD[Vector]): RDD[Array[Double]] = { val sc = points.sparkContext val bcDists = sc.broadcast(gaussians) @@ -108,8 +106,8 @@ class GaussianMixtureModel( /** * Given the input vector, return the membership values to all mixture components. 
- * @since 1.4.0 */ + @Since("1.4.0") def predictSoft(point: Vector): Array[Double] = { computeSoftAssignments(point.toBreeze.toDenseVector, gaussians, weights, k) } @@ -133,9 +131,7 @@ class GaussianMixtureModel( } } -/** - * @since 1.4.0 - */ +@Since("1.4.0") @Experimental object GaussianMixtureModel extends Loader[GaussianMixtureModel] { @@ -186,9 +182,7 @@ object GaussianMixtureModel extends Loader[GaussianMixtureModel] { } } - /** - * @since 1.4.0 - */ + @Since("1.4.0") override def load(sc: SparkContext, path: String) : GaussianMixtureModel = { val (loadedClassName, version, metadata) = Loader.loadMetadata(sc, path) implicit val formats = DefaultFormats diff --git a/mllib/src/main/scala/org/apache/spark/mllib/clustering/KMeans.scala b/mllib/src/main/scala/org/apache/spark/mllib/clustering/KMeans.scala index 9ef6834e5ea8d..3e9545a74bef3 100644 --- a/mllib/src/main/scala/org/apache/spark/mllib/clustering/KMeans.scala +++ b/mllib/src/main/scala/org/apache/spark/mllib/clustering/KMeans.scala @@ -20,7 +20,7 @@ package org.apache.spark.mllib.clustering import scala.collection.mutable.ArrayBuffer import org.apache.spark.Logging -import org.apache.spark.annotation.Experimental +import org.apache.spark.annotation.{Experimental, Since} import org.apache.spark.mllib.linalg.{Vector, Vectors} import org.apache.spark.mllib.linalg.BLAS.{axpy, scal} import org.apache.spark.mllib.util.MLUtils @@ -49,20 +49,20 @@ class KMeans private ( /** * Constructs a KMeans instance with default parameters: {k: 2, maxIterations: 20, runs: 1, * initializationMode: "k-means||", initializationSteps: 5, epsilon: 1e-4, seed: random}. - * @since 0.8.0 */ + @Since("0.8.0") def this() = this(2, 20, 1, KMeans.K_MEANS_PARALLEL, 5, 1e-4, Utils.random.nextLong()) /** * Number of clusters to create (k). - * @since 1.4.0 */ + @Since("1.4.0") def getK: Int = k /** * Set the number of clusters to create (k). Default: 2. - * @since 0.8.0 */ + @Since("0.8.0") def setK(k: Int): this.type = { this.k = k this @@ -70,14 +70,14 @@ class KMeans private ( /** * Maximum number of iterations to run. - * @since 1.4.0 */ + @Since("1.4.0") def getMaxIterations: Int = maxIterations /** * Set maximum number of iterations to run. Default: 20. - * @since 0.8.0 */ + @Since("0.8.0") def setMaxIterations(maxIterations: Int): this.type = { this.maxIterations = maxIterations this @@ -85,16 +85,16 @@ class KMeans private ( /** * The initialization algorithm. This can be either "random" or "k-means||". - * @since 1.4.0 */ + @Since("1.4.0") def getInitializationMode: String = initializationMode /** * Set the initialization algorithm. This can be either "random" to choose random points as * initial cluster centers, or "k-means||" to use a parallel variant of k-means++ * (Bahmani et al., Scalable K-Means++, VLDB 2012). Default: k-means||. - * @since 0.8.0 */ + @Since("0.8.0") def setInitializationMode(initializationMode: String): this.type = { KMeans.validateInitMode(initializationMode) this.initializationMode = initializationMode @@ -104,8 +104,8 @@ class KMeans private ( /** * :: Experimental :: * Number of runs of the algorithm to execute in parallel. - * @since 1.4.0 */ + @Since("1.4.0") @Experimental def getRuns: Int = runs @@ -114,8 +114,8 @@ class KMeans private ( * Set the number of runs of the algorithm to execute in parallel. We initialize the algorithm * this many times with random starting conditions (configured by the initialization mode), then * return the best clustering found over any run. Default: 1. 
- * @since 0.8.0 */ + @Since("0.8.0") @Experimental def setRuns(runs: Int): this.type = { if (runs <= 0) { @@ -127,15 +127,15 @@ class KMeans private ( /** * Number of steps for the k-means|| initialization mode - * @since 1.4.0 */ + @Since("1.4.0") def getInitializationSteps: Int = initializationSteps /** * Set the number of steps for the k-means|| initialization mode. This is an advanced * setting -- the default of 5 is almost always enough. Default: 5. - * @since 0.8.0 */ + @Since("0.8.0") def setInitializationSteps(initializationSteps: Int): this.type = { if (initializationSteps <= 0) { throw new IllegalArgumentException("Number of initialization steps must be positive") @@ -146,15 +146,15 @@ class KMeans private ( /** * The distance threshold within which we've consider centers to have converged. - * @since 1.4.0 */ + @Since("1.4.0") def getEpsilon: Double = epsilon /** * Set the distance threshold within which we've consider centers to have converged. * If all centers move less than this Euclidean distance, we stop iterating one run. - * @since 0.8.0 */ + @Since("0.8.0") def setEpsilon(epsilon: Double): this.type = { this.epsilon = epsilon this @@ -162,14 +162,14 @@ class KMeans private ( /** * The random seed for cluster initialization. - * @since 1.4.0 */ + @Since("1.4.0") def getSeed: Long = seed /** * Set the random seed for cluster initialization. - * @since 1.4.0 */ + @Since("1.4.0") def setSeed(seed: Long): this.type = { this.seed = seed this @@ -183,8 +183,8 @@ class KMeans private ( * Set the initial starting point, bypassing the random initialization or k-means|| * The condition model.k == this.k must be met, failure results * in an IllegalArgumentException. - * @since 1.4.0 */ + @Since("1.4.0") def setInitialModel(model: KMeansModel): this.type = { require(model.k == k, "mismatched cluster count") initialModel = Some(model) @@ -194,8 +194,8 @@ class KMeans private ( /** * Train a K-means model on the given set of points; `data` should be cached for high * performance, because this is an iterative algorithm. - * @since 0.8.0 */ + @Since("0.8.0") def run(data: RDD[Vector]): KMeansModel = { if (data.getStorageLevel == StorageLevel.NONE) { @@ -453,14 +453,14 @@ class KMeans private ( /** * Top-level methods for calling K-means clustering. - * @since 0.8.0 */ +@Since("0.8.0") object KMeans { // Initialization mode names - /** @since 0.8.0 */ + @Since("0.8.0") val RANDOM = "random" - /** @since 0.8.0 */ + @Since("0.8.0") val K_MEANS_PARALLEL = "k-means||" /** @@ -472,8 +472,8 @@ object KMeans { * @param runs number of parallel runs, defaults to 1. The best model is returned. * @param initializationMode initialization model, either "random" or "k-means||" (default). * @param seed random seed value for cluster initialization - * @since 1.3.0 */ + @Since("1.3.0") def train( data: RDD[Vector], k: Int, @@ -497,8 +497,8 @@ object KMeans { * @param maxIterations max number of iterations * @param runs number of parallel runs, defaults to 1. The best model is returned. * @param initializationMode initialization model, either "random" or "k-means||" (default). - * @since 0.8.0 */ + @Since("0.8.0") def train( data: RDD[Vector], k: Int, @@ -514,8 +514,8 @@ object KMeans { /** * Trains a k-means model using specified parameters and the default values for unspecified. - * @since 0.8.0 */ + @Since("0.8.0") def train( data: RDD[Vector], k: Int, @@ -525,8 +525,8 @@ object KMeans { /** * Trains a k-means model using specified parameters and the default values for unspecified. 
- * @since 0.8.0 */ + @Since("0.8.0") def train( data: RDD[Vector], k: Int, diff --git a/mllib/src/main/scala/org/apache/spark/mllib/clustering/KMeansModel.scala b/mllib/src/main/scala/org/apache/spark/mllib/clustering/KMeansModel.scala index 8de2087ceb4df..e425ecdd481c6 100644 --- a/mllib/src/main/scala/org/apache/spark/mllib/clustering/KMeansModel.scala +++ b/mllib/src/main/scala/org/apache/spark/mllib/clustering/KMeansModel.scala @@ -23,6 +23,7 @@ import org.json4s._ import org.json4s.JsonDSL._ import org.json4s.jackson.JsonMethods._ +import org.apache.spark.annotation.Since import org.apache.spark.api.java.JavaRDD import org.apache.spark.mllib.linalg.Vector import org.apache.spark.mllib.pmml.PMMLExportable @@ -34,35 +35,35 @@ import org.apache.spark.sql.Row /** * A clustering model for K-means. Each point belongs to the cluster with the closest center. - * @since 0.8.0 */ +@Since("0.8.0") class KMeansModel ( val clusterCenters: Array[Vector]) extends Saveable with Serializable with PMMLExportable { /** * A Java-friendly constructor that takes an Iterable of Vectors. - * @since 1.4.0 */ + @Since("1.4.0") def this(centers: java.lang.Iterable[Vector]) = this(centers.asScala.toArray) /** * Total number of clusters. - * @since 0.8.0 */ + @Since("0.8.0") def k: Int = clusterCenters.length /** * Returns the cluster index that a given point belongs to. - * @since 0.8.0 */ + @Since("0.8.0") def predict(point: Vector): Int = { KMeans.findClosest(clusterCentersWithNorm, new VectorWithNorm(point))._1 } /** * Maps given points to their cluster indices. - * @since 1.0.0 */ + @Since("1.0.0") def predict(points: RDD[Vector]): RDD[Int] = { val centersWithNorm = clusterCentersWithNorm val bcCentersWithNorm = points.context.broadcast(centersWithNorm) @@ -71,16 +72,16 @@ class KMeansModel ( /** * Maps given points to their cluster indices. - * @since 1.0.0 */ + @Since("1.0.0") def predict(points: JavaRDD[Vector]): JavaRDD[java.lang.Integer] = predict(points.rdd).toJavaRDD().asInstanceOf[JavaRDD[java.lang.Integer]] /** * Return the K-means cost (sum of squared distances of points to their nearest center) for this * model on the given data. 
- * @since 0.8.0 */ + @Since("0.8.0") def computeCost(data: RDD[Vector]): Double = { val centersWithNorm = clusterCentersWithNorm val bcCentersWithNorm = data.context.broadcast(centersWithNorm) @@ -90,9 +91,7 @@ class KMeansModel ( private def clusterCentersWithNorm: Iterable[VectorWithNorm] = clusterCenters.map(new VectorWithNorm(_)) - /** - * @since 1.4.0 - */ + @Since("1.4.0") override def save(sc: SparkContext, path: String): Unit = { KMeansModel.SaveLoadV1_0.save(sc, this, path) } @@ -100,14 +99,10 @@ class KMeansModel ( override protected def formatVersion: String = "1.0" } -/** - * @since 1.4.0 - */ +@Since("1.4.0") object KMeansModel extends Loader[KMeansModel] { - /** - * @since 1.4.0 - */ + @Since("1.4.0") override def load(sc: SparkContext, path: String): KMeansModel = { KMeansModel.SaveLoadV1_0.load(sc, path) } diff --git a/mllib/src/main/scala/org/apache/spark/mllib/clustering/LDA.scala b/mllib/src/main/scala/org/apache/spark/mllib/clustering/LDA.scala index 2a8c6acbaec61..92a321afb0ca3 100644 --- a/mllib/src/main/scala/org/apache/spark/mllib/clustering/LDA.scala +++ b/mllib/src/main/scala/org/apache/spark/mllib/clustering/LDA.scala @@ -20,7 +20,7 @@ package org.apache.spark.mllib.clustering import breeze.linalg.{DenseVector => BDV} import org.apache.spark.Logging -import org.apache.spark.annotation.{DeveloperApi, Experimental} +import org.apache.spark.annotation.{DeveloperApi, Experimental, Since} import org.apache.spark.api.java.JavaPairRDD import org.apache.spark.graphx._ import org.apache.spark.mllib.linalg.{Vector, Vectors} @@ -43,8 +43,8 @@ import org.apache.spark.util.Utils * * @see [[http://en.wikipedia.org/wiki/Latent_Dirichlet_allocation Latent Dirichlet allocation * (Wikipedia)]] - * @since 1.3.0 */ +@Since("1.3.0") @Experimental class LDA private ( private var k: Int, @@ -57,8 +57,8 @@ class LDA private ( /** * Constructs a LDA instance with default parameters. - * @since 1.3.0 */ + @Since("1.3.0") def this() = this(k = 10, maxIterations = 20, docConcentration = Vectors.dense(-1), topicConcentration = -1, seed = Utils.random.nextLong(), checkpointInterval = 10, ldaOptimizer = new EMLDAOptimizer) @@ -66,15 +66,15 @@ class LDA private ( /** * Number of topics to infer. I.e., the number of soft cluster centers. * - * @since 1.3.0 */ + @Since("1.3.0") def getK: Int = k /** * Number of topics to infer. I.e., the number of soft cluster centers. * (default = 10) - * @since 1.3.0 */ + @Since("1.3.0") def setK(k: Int): this.type = { require(k > 0, s"LDA k (number of clusters) must be > 0, but was set to $k") this.k = k @@ -86,8 +86,8 @@ class LDA private ( * distributions over topics ("theta"). * * This is the parameter to a Dirichlet distribution. - * @since 1.5.0 */ + @Since("1.5.0") def getAsymmetricDocConcentration: Vector = this.docConcentration /** @@ -96,8 +96,8 @@ class LDA private ( * * This method assumes the Dirichlet distribution is symmetric and can be described by a single * [[Double]] parameter. It should fail if docConcentration is asymmetric. - * @since 1.3.0 */ + @Since("1.3.0") def getDocConcentration: Double = { val parameter = docConcentration(0) if (docConcentration.size == 1) { @@ -131,8 +131,8 @@ class LDA private ( * - Values should be >= 0 * - default = uniformly (1.0 / k), following the implementation from * [[https://github.com/Blei-Lab/onlineldavb]]. 
- * @since 1.5.0 */ + @Since("1.5.0") def setDocConcentration(docConcentration: Vector): this.type = { require(docConcentration.size > 0, "docConcentration must have > 0 elements") this.docConcentration = docConcentration @@ -141,8 +141,8 @@ class LDA private ( /** * Replicates a [[Double]] docConcentration to create a symmetric prior. - * @since 1.3.0 */ + @Since("1.3.0") def setDocConcentration(docConcentration: Double): this.type = { this.docConcentration = Vectors.dense(docConcentration) this @@ -150,26 +150,26 @@ class LDA private ( /** * Alias for [[getAsymmetricDocConcentration]] - * @since 1.5.0 */ + @Since("1.5.0") def getAsymmetricAlpha: Vector = getAsymmetricDocConcentration /** * Alias for [[getDocConcentration]] - * @since 1.3.0 */ + @Since("1.3.0") def getAlpha: Double = getDocConcentration /** * Alias for [[setDocConcentration()]] - * @since 1.5.0 */ + @Since("1.5.0") def setAlpha(alpha: Vector): this.type = setDocConcentration(alpha) /** * Alias for [[setDocConcentration()]] - * @since 1.3.0 */ + @Since("1.3.0") def setAlpha(alpha: Double): this.type = setDocConcentration(alpha) /** @@ -180,8 +180,8 @@ class LDA private ( * * Note: The topics' distributions over terms are called "beta" in the original LDA paper * by Blei et al., but are called "phi" in many later papers such as Asuncion et al., 2009. - * @since 1.3.0 */ + @Since("1.3.0") def getTopicConcentration: Double = this.topicConcentration /** @@ -205,8 +205,8 @@ class LDA private ( * - Value should be >= 0 * - default = (1.0 / k), following the implementation from * [[https://github.com/Blei-Lab/onlineldavb]]. - * @since 1.3.0 */ + @Since("1.3.0") def setTopicConcentration(topicConcentration: Double): this.type = { this.topicConcentration = topicConcentration this @@ -214,27 +214,27 @@ class LDA private ( /** * Alias for [[getTopicConcentration]] - * @since 1.3.0 */ + @Since("1.3.0") def getBeta: Double = getTopicConcentration /** * Alias for [[setTopicConcentration()]] - * @since 1.3.0 */ + @Since("1.3.0") def setBeta(beta: Double): this.type = setTopicConcentration(beta) /** * Maximum number of iterations for learning. - * @since 1.3.0 */ + @Since("1.3.0") def getMaxIterations: Int = maxIterations /** * Maximum number of iterations for learning. * (default = 20) - * @since 1.3.0 */ + @Since("1.3.0") def setMaxIterations(maxIterations: Int): this.type = { this.maxIterations = maxIterations this @@ -242,14 +242,14 @@ class LDA private ( /** * Random seed - * @since 1.3.0 */ + @Since("1.3.0") def getSeed: Long = seed /** * Random seed - * @since 1.3.0 */ + @Since("1.3.0") def setSeed(seed: Long): this.type = { this.seed = seed this @@ -257,8 +257,8 @@ class LDA private ( /** * Period (in iterations) between checkpoints. - * @since 1.3.0 */ + @Since("1.3.0") def getCheckpointInterval: Int = checkpointInterval /** @@ -268,8 +268,8 @@ class LDA private ( * [[org.apache.spark.SparkContext]], this setting is ignored. 
* * @see [[org.apache.spark.SparkContext#setCheckpointDir]] - * @since 1.3.0 */ + @Since("1.3.0") def setCheckpointInterval(checkpointInterval: Int): this.type = { this.checkpointInterval = checkpointInterval this @@ -280,8 +280,8 @@ class LDA private ( * :: DeveloperApi :: * * LDAOptimizer used to perform the actual calculation - * @since 1.4.0 */ + @Since("1.4.0") @DeveloperApi def getOptimizer: LDAOptimizer = ldaOptimizer @@ -289,8 +289,8 @@ class LDA private ( * :: DeveloperApi :: * * LDAOptimizer used to perform the actual calculation (default = EMLDAOptimizer) - * @since 1.4.0 */ + @Since("1.4.0") @DeveloperApi def setOptimizer(optimizer: LDAOptimizer): this.type = { this.ldaOptimizer = optimizer @@ -300,8 +300,8 @@ class LDA private ( /** * Set the LDAOptimizer used to perform the actual calculation by algorithm name. * Currently "em", "online" are supported. - * @since 1.4.0 */ + @Since("1.4.0") def setOptimizer(optimizerName: String): this.type = { this.ldaOptimizer = optimizerName.toLowerCase match { @@ -321,8 +321,8 @@ class LDA private ( * (where the vocabulary size is the length of the vector). * Document IDs must be unique and >= 0. * @return Inferred LDA model - * @since 1.3.0 */ + @Since("1.3.0") def run(documents: RDD[(Long, Vector)]): LDAModel = { val state = ldaOptimizer.initialize(documents, this) var iter = 0 @@ -339,8 +339,8 @@ class LDA private ( /** * Java-friendly version of [[run()]] - * @since 1.3.0 */ + @Since("1.3.0") def run(documents: JavaPairRDD[java.lang.Long, Vector]): LDAModel = { run(documents.rdd.asInstanceOf[RDD[(Long, Vector)]]) } diff --git a/mllib/src/main/scala/org/apache/spark/mllib/clustering/LDAModel.scala b/mllib/src/main/scala/org/apache/spark/mllib/clustering/LDAModel.scala index 6bc68a4c18b99..667374a2bc418 100644 --- a/mllib/src/main/scala/org/apache/spark/mllib/clustering/LDAModel.scala +++ b/mllib/src/main/scala/org/apache/spark/mllib/clustering/LDAModel.scala @@ -25,7 +25,7 @@ import org.json4s.JsonDSL._ import org.json4s.jackson.JsonMethods._ import org.apache.spark.SparkContext -import org.apache.spark.annotation.Experimental +import org.apache.spark.annotation.{Experimental, Since} import org.apache.spark.api.java.{JavaPairRDD, JavaRDD} import org.apache.spark.graphx.{Edge, EdgeContext, Graph, VertexId} import org.apache.spark.mllib.linalg.{Matrices, Matrix, Vector, Vectors} @@ -192,24 +192,16 @@ class LocalLDAModel private[clustering] ( override protected[clustering] val gammaShape: Double = 100) extends LDAModel with Serializable { - /** - * @since 1.3.0 - */ + @Since("1.3.0") override def k: Int = topics.numCols - /** - * @since 1.3.0 - */ + @Since("1.3.0") override def vocabSize: Int = topics.numRows - /** - * @since 1.3.0 - */ + @Since("1.3.0") override def topicsMatrix: Matrix = topics - /** - * @since 1.3.0 - */ + @Since("1.3.0") override def describeTopics(maxTermsPerTopic: Int): Array[(Array[Int], Array[Double])] = { val brzTopics = topics.toBreeze.toDenseMatrix Range(0, k).map { topicIndex => @@ -222,9 +214,7 @@ class LocalLDAModel private[clustering] ( override protected def formatVersion = "1.0" - /** - * @since 1.5.0 - */ + @Since("1.5.0") override def save(sc: SparkContext, path: String): Unit = { LocalLDAModel.SaveLoadV1_0.save(sc, path, topicsMatrix, docConcentration, topicConcentration, gammaShape) @@ -238,16 +228,16 @@ class LocalLDAModel private[clustering] ( * * @param documents test corpus to use for calculating log likelihood * @return variational lower bound on the log likelihood of the entire corpus - * @since 
1.5.0 */ + @Since("1.5.0") def logLikelihood(documents: RDD[(Long, Vector)]): Double = logLikelihoodBound(documents, docConcentration, topicConcentration, topicsMatrix.toBreeze.toDenseMatrix, gammaShape, k, vocabSize) /** * Java-friendly version of [[logLikelihood]] - * @since 1.5.0 */ + @Since("1.5.0") def logLikelihood(documents: JavaPairRDD[java.lang.Long, Vector]): Double = { logLikelihood(documents.rdd.asInstanceOf[RDD[(Long, Vector)]]) } @@ -258,8 +248,8 @@ class LocalLDAModel private[clustering] ( * * @param documents test corpus to use for calculating perplexity * @return Variational upper bound on log perplexity per token. - * @since 1.5.0 */ + @Since("1.5.0") def logPerplexity(documents: RDD[(Long, Vector)]): Double = { val corpusTokenCount = documents .map { case (_, termCounts) => termCounts.toArray.sum } @@ -267,9 +257,8 @@ class LocalLDAModel private[clustering] ( -logLikelihood(documents) / corpusTokenCount } - /** Java-friendly version of [[logPerplexity]] - * @since 1.5.0 - */ + /** Java-friendly version of [[logPerplexity]] */ + @Since("1.5.0") def logPerplexity(documents: JavaPairRDD[java.lang.Long, Vector]): Double = { logPerplexity(documents.rdd.asInstanceOf[RDD[(Long, Vector)]]) } @@ -347,8 +336,8 @@ class LocalLDAModel private[clustering] ( * for each document. * @param documents documents to predict topic mixture distributions for * @return An RDD of (document ID, topic mixture distribution for document) - * @since 1.3.0 */ + @Since("1.3.0") // TODO: declare in LDAModel and override once implemented in DistributedLDAModel def topicDistributions(documents: RDD[(Long, Vector)]): RDD[(Long, Vector)] = { // Double transpose because dirichletExpectation normalizes by row and we need to normalize @@ -376,8 +365,8 @@ class LocalLDAModel private[clustering] ( /** * Java-friendly version of [[topicDistributions]] - * @since 1.4.1 */ + @Since("1.4.1") def topicDistributions( documents: JavaPairRDD[java.lang.Long, Vector]): JavaPairRDD[java.lang.Long, Vector] = { val distributions = topicDistributions(documents.rdd.asInstanceOf[RDD[(Long, Vector)]]) @@ -451,9 +440,7 @@ object LocalLDAModel extends Loader[LocalLDAModel] { } } - /** - * @since 1.5.0 - */ + @Since("1.5.0") override def load(sc: SparkContext, path: String): LocalLDAModel = { val (loadedClassName, loadedVersion, metadata) = Loader.loadMetadata(sc, path) implicit val formats = DefaultFormats @@ -510,8 +497,8 @@ class DistributedLDAModel private[clustering] ( * Convert model to a local model. * The local model stores the inferred topics but not the topic distributions for training * documents. - * @since 1.3.0 */ + @Since("1.3.0") def toLocal: LocalLDAModel = new LocalLDAModel(topicsMatrix, docConcentration, topicConcentration, gammaShape) @@ -521,8 +508,8 @@ class DistributedLDAModel private[clustering] ( * No guarantees are given about the ordering of the topics. * * WARNING: This matrix is collected from an RDD. Beware memory usage when vocabSize, k are large. 
- * @since 1.3.0 */ + @Since("1.3.0") override lazy val topicsMatrix: Matrix = { // Collect row-major topics val termTopicCounts: Array[(Int, TopicCounts)] = @@ -541,9 +528,7 @@ class DistributedLDAModel private[clustering] ( Matrices.fromBreeze(brzTopics) } - /** - * @since 1.3.0 - */ + @Since("1.3.0") override def describeTopics(maxTermsPerTopic: Int): Array[(Array[Int], Array[Double])] = { val numTopics = k // Note: N_k is not needed to find the top terms, but it is needed to normalize weights @@ -582,8 +567,8 @@ class DistributedLDAModel private[clustering] ( * @return Array over topics. Each element represent as a pair of matching arrays: * (IDs for the documents, weights of the topic in these documents). * For each topic, documents are sorted in order of decreasing topic weights. - * @since 1.5.0 */ + @Since("1.5.0") def topDocumentsPerTopic(maxDocumentsPerTopic: Int): Array[(Array[Long], Array[Double])] = { val numTopics = k val topicsInQueues: Array[BoundedPriorityQueue[(Double, Long)]] = @@ -666,8 +651,8 @@ class DistributedLDAModel private[clustering] ( * - This excludes the prior; for that, use [[logPrior]]. * - Even with [[logPrior]], this is NOT the same as the data log likelihood given the * hyperparameters. - * @since 1.3.0 */ + @Since("1.3.0") lazy val logLikelihood: Double = { // TODO: generalize this for asymmetric (non-scalar) alpha val alpha = this.docConcentration(0) // To avoid closure capture of enclosing object @@ -693,8 +678,8 @@ class DistributedLDAModel private[clustering] ( /** * Log probability of the current parameter estimate: * log P(topics, topic distributions for docs | alpha, eta) - * @since 1.3.0 */ + @Since("1.3.0") lazy val logPrior: Double = { // TODO: generalize this for asymmetric (non-scalar) alpha val alpha = this.docConcentration(0) // To avoid closure capture of enclosing object @@ -725,8 +710,8 @@ class DistributedLDAModel private[clustering] ( * ("theta_doc"). * * @return RDD of (document ID, topic distribution) pairs - * @since 1.3.0 */ + @Since("1.3.0") def topicDistributions: RDD[(Long, Vector)] = { graph.vertices.filter(LDA.isDocumentVertex).map { case (docID, topicCounts) => (docID.toLong, Vectors.fromBreeze(normalize(topicCounts, 1.0))) @@ -735,8 +720,8 @@ class DistributedLDAModel private[clustering] ( /** * Java-friendly version of [[topicDistributions]] - * @since 1.4.1 */ + @Since("1.4.1") def javaTopicDistributions: JavaPairRDD[java.lang.Long, Vector] = { JavaPairRDD.fromRDD(topicDistributions.asInstanceOf[RDD[(java.lang.Long, Vector)]]) } @@ -744,8 +729,8 @@ class DistributedLDAModel private[clustering] ( /** * For each document, return the top k weighted topics for that document and their weights. 
* @return RDD of (doc ID, topic indices, topic weights) - * @since 1.5.0 */ + @Since("1.5.0") def topTopicsPerDocument(k: Int): RDD[(Long, Array[Int], Array[Double])] = { graph.vertices.filter(LDA.isDocumentVertex).map { case (docID, topicCounts) => val topIndices = argtopk(topicCounts, k) @@ -761,8 +746,8 @@ class DistributedLDAModel private[clustering] ( /** * Java-friendly version of [[topTopicsPerDocument]] - * @since 1.5.0 */ + @Since("1.5.0") def javaTopTopicsPerDocument(k: Int): JavaRDD[(java.lang.Long, Array[Int], Array[Double])] = { val topics = topTopicsPerDocument(k) topics.asInstanceOf[RDD[(java.lang.Long, Array[Int], Array[Double])]].toJavaRDD() @@ -775,8 +760,8 @@ class DistributedLDAModel private[clustering] ( /** * Java-friendly version of [[topicDistributions]] - * @since 1.5.0 */ + @Since("1.5.0") override def save(sc: SparkContext, path: String): Unit = { DistributedLDAModel.SaveLoadV1_0.save( sc, path, graph, globalTopicTotals, k, vocabSize, docConcentration, topicConcentration, @@ -877,9 +862,7 @@ object DistributedLDAModel extends Loader[DistributedLDAModel] { } - /** - * @since 1.5.0 - */ + @Since("1.5.0") override def load(sc: SparkContext, path: String): DistributedLDAModel = { val (loadedClassName, loadedVersion, metadata) = Loader.loadMetadata(sc, path) implicit val formats = DefaultFormats diff --git a/mllib/src/main/scala/org/apache/spark/mllib/clustering/LDAOptimizer.scala b/mllib/src/main/scala/org/apache/spark/mllib/clustering/LDAOptimizer.scala index cb517f9689ade..5c2aae6403bea 100644 --- a/mllib/src/main/scala/org/apache/spark/mllib/clustering/LDAOptimizer.scala +++ b/mllib/src/main/scala/org/apache/spark/mllib/clustering/LDAOptimizer.scala @@ -23,7 +23,7 @@ import breeze.linalg.{DenseMatrix => BDM, DenseVector => BDV, all, normalize, su import breeze.numerics.{trigamma, abs, exp} import breeze.stats.distributions.{Gamma, RandBasis} -import org.apache.spark.annotation.DeveloperApi +import org.apache.spark.annotation.{DeveloperApi, Since} import org.apache.spark.graphx._ import org.apache.spark.graphx.impl.GraphImpl import org.apache.spark.mllib.impl.PeriodicGraphCheckpointer @@ -35,8 +35,8 @@ import org.apache.spark.rdd.RDD * * An LDAOptimizer specifies which optimization/learning/inference algorithm to use, and it can * hold optimizer-specific parameters for users to set. - * @since 1.4.0 */ +@Since("1.4.0") @DeveloperApi sealed trait LDAOptimizer { @@ -74,8 +74,8 @@ sealed trait LDAOptimizer { * - Paper which clearly explains several algorithms, including EM: * Asuncion, Welling, Smyth, and Teh. * "On Smoothing and Inference for Topic Models." UAI, 2009. - * @since 1.4.0 */ +@Since("1.4.0") @DeveloperApi final class EMLDAOptimizer extends LDAOptimizer { @@ -226,8 +226,8 @@ final class EMLDAOptimizer extends LDAOptimizer { * * Original Online LDA paper: * Hoffman, Blei and Bach, "Online Learning for Latent Dirichlet Allocation." NIPS, 2010. - * @since 1.4.0 */ +@Since("1.4.0") @DeveloperApi final class OnlineLDAOptimizer extends LDAOptimizer { @@ -276,16 +276,16 @@ final class OnlineLDAOptimizer extends LDAOptimizer { /** * A (positive) learning parameter that downweights early iterations. Larger values make early * iterations count less. - * @since 1.4.0 */ + @Since("1.4.0") def getTau0: Double = this.tau0 /** * A (positive) learning parameter that downweights early iterations. Larger values make early * iterations count less. * Default: 1024, following the original Online LDA paper. 
- * @since 1.4.0 */ + @Since("1.4.0") def setTau0(tau0: Double): this.type = { require(tau0 > 0, s"LDA tau0 must be positive, but was set to $tau0") this.tau0 = tau0 @@ -294,16 +294,16 @@ final class OnlineLDAOptimizer extends LDAOptimizer { /** * Learning rate: exponential decay rate - * @since 1.4.0 */ + @Since("1.4.0") def getKappa: Double = this.kappa /** * Learning rate: exponential decay rate---should be between * (0.5, 1.0] to guarantee asymptotic convergence. * Default: 0.51, based on the original Online LDA paper. - * @since 1.4.0 */ + @Since("1.4.0") def setKappa(kappa: Double): this.type = { require(kappa >= 0, s"Online LDA kappa must be nonnegative, but was set to $kappa") this.kappa = kappa @@ -312,8 +312,8 @@ final class OnlineLDAOptimizer extends LDAOptimizer { /** * Mini-batch fraction, which sets the fraction of document sampled and used in each iteration - * @since 1.4.0 */ + @Since("1.4.0") def getMiniBatchFraction: Double = this.miniBatchFraction /** @@ -325,8 +325,8 @@ final class OnlineLDAOptimizer extends LDAOptimizer { * maxIterations * miniBatchFraction >= 1. * * Default: 0.05, i.e., 5% of total documents. - * @since 1.4.0 */ + @Since("1.4.0") def setMiniBatchFraction(miniBatchFraction: Double): this.type = { require(miniBatchFraction > 0.0 && miniBatchFraction <= 1.0, s"Online LDA miniBatchFraction must be in range (0,1], but was set to $miniBatchFraction") @@ -337,16 +337,16 @@ final class OnlineLDAOptimizer extends LDAOptimizer { /** * Optimize alpha, indicates whether alpha (Dirichlet parameter for document-topic distribution) * will be optimized during training. - * @since 1.5.0 */ + @Since("1.5.0") def getOptimzeAlpha: Boolean = this.optimizeAlpha /** * Sets whether to optimize alpha parameter during training. * * Default: false - * @since 1.5.0 */ + @Since("1.5.0") def setOptimzeAlpha(optimizeAlpha: Boolean): this.type = { this.optimizeAlpha = optimizeAlpha this diff --git a/mllib/src/main/scala/org/apache/spark/mllib/clustering/PowerIterationClustering.scala b/mllib/src/main/scala/org/apache/spark/mllib/clustering/PowerIterationClustering.scala index b4733ca975152..396b36f2f6454 100644 --- a/mllib/src/main/scala/org/apache/spark/mllib/clustering/PowerIterationClustering.scala +++ b/mllib/src/main/scala/org/apache/spark/mllib/clustering/PowerIterationClustering.scala @@ -21,7 +21,7 @@ import org.json4s.JsonDSL._ import org.json4s._ import org.json4s.jackson.JsonMethods._ -import org.apache.spark.annotation.Experimental +import org.apache.spark.annotation.{Experimental, Since} import org.apache.spark.api.java.JavaRDD import org.apache.spark.graphx._ import org.apache.spark.graphx.impl.GraphImpl @@ -39,16 +39,14 @@ import org.apache.spark.{Logging, SparkContext, SparkException} * * @param k number of clusters * @param assignments an RDD of clustering [[PowerIterationClustering#Assignment]]s - * @since 1.3.0 */ +@Since("1.3.0") @Experimental class PowerIterationClusteringModel( val k: Int, val assignments: RDD[PowerIterationClustering.Assignment]) extends Saveable with Serializable { - /** - * @since 1.4.0 - */ + @Since("1.4.0") override def save(sc: SparkContext, path: String): Unit = { PowerIterationClusteringModel.SaveLoadV1_0.save(sc, this, path) } @@ -56,9 +54,7 @@ class PowerIterationClusteringModel( override protected def formatVersion: String = "1.0" } -/** - * @since 1.4.0 - */ +@Since("1.4.0") object PowerIterationClusteringModel extends Loader[PowerIterationClusteringModel] { override def load(sc: SparkContext, path: String): 
PowerIterationClusteringModel = { PowerIterationClusteringModel.SaveLoadV1_0.load(sc, path) @@ -73,8 +69,8 @@ object PowerIterationClusteringModel extends Loader[PowerIterationClusteringMode val thisClassName = "org.apache.spark.mllib.clustering.PowerIterationClusteringModel" /** - * @since 1.4.0 */ + @Since("1.4.0") def save(sc: SparkContext, model: PowerIterationClusteringModel, path: String): Unit = { val sqlContext = new SQLContext(sc) import sqlContext.implicits._ @@ -87,9 +83,7 @@ object PowerIterationClusteringModel extends Loader[PowerIterationClusteringMode dataRDD.write.parquet(Loader.dataPath(path)) } - /** - * @since 1.4.0 - */ + @Since("1.4.0") def load(sc: SparkContext, path: String): PowerIterationClusteringModel = { implicit val formats = DefaultFormats val sqlContext = new SQLContext(sc) @@ -136,14 +130,14 @@ class PowerIterationClustering private[clustering] ( /** * Constructs a PIC instance with default parameters: {k: 2, maxIterations: 100, * initMode: "random"}. - * @since 1.3.0 */ + @Since("1.3.0") def this() = this(k = 2, maxIterations = 100, initMode = "random") /** * Set the number of clusters. - * @since 1.3.0 */ + @Since("1.3.0") def setK(k: Int): this.type = { this.k = k this @@ -151,8 +145,8 @@ class PowerIterationClustering private[clustering] ( /** * Set maximum number of iterations of the power iteration loop - * @since 1.3.0 */ + @Since("1.3.0") def setMaxIterations(maxIterations: Int): this.type = { this.maxIterations = maxIterations this @@ -161,8 +155,8 @@ class PowerIterationClustering private[clustering] ( /** * Set the initialization mode. This can be either "random" to use a random vector * as vertex properties, or "degree" to use normalized sum similarities. Default: random. - * @since 1.3.0 */ + @Since("1.3.0") def setInitializationMode(mode: String): this.type = { this.initMode = mode match { case "random" | "degree" => mode @@ -182,8 +176,8 @@ class PowerIterationClustering private[clustering] ( * assume s,,ij,, = 0.0. * * @return a [[PowerIterationClusteringModel]] that contains the clustering result - * @since 1.5.0 */ + @Since("1.5.0") def run(graph: Graph[Double, Double]): PowerIterationClusteringModel = { val w = normalize(graph) val w0 = initMode match { @@ -204,8 +198,8 @@ class PowerIterationClustering private[clustering] ( * assume s,,ij,, = 0.0. * * @return a [[PowerIterationClusteringModel]] that contains the clustering result - * @since 1.3.0 */ + @Since("1.3.0") def run(similarities: RDD[(Long, Long, Double)]): PowerIterationClusteringModel = { val w = normalize(similarities) val w0 = initMode match { @@ -217,8 +211,8 @@ class PowerIterationClustering private[clustering] ( /** * A Java-friendly version of [[PowerIterationClustering.run]]. - * @since 1.3.0 */ + @Since("1.3.0") def run(similarities: JavaRDD[(java.lang.Long, java.lang.Long, java.lang.Double)]) : PowerIterationClusteringModel = { run(similarities.rdd.asInstanceOf[RDD[(Long, Long, Double)]]) @@ -242,9 +236,7 @@ class PowerIterationClustering private[clustering] ( } } -/** - * @since 1.3.0 - */ +@Since("1.3.0") @Experimental object PowerIterationClustering extends Logging { @@ -253,8 +245,8 @@ object PowerIterationClustering extends Logging { * Cluster assignment. 
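// --- Illustrative usage sketch (not part of the patch) ---
// Demonstrates the PowerIterationClustering API annotated above: build an
// RDD of (srcId, dstId, similarity) tuples and cluster the implied graph.
// Assumes an existing SparkContext `sc`; the toy similarities are placeholders.
import org.apache.spark.mllib.clustering.PowerIterationClustering

val similarities = sc.parallelize(Seq(
  (0L, 1L, 0.9), (1L, 2L, 0.9), (2L, 3L, 0.1), (3L, 4L, 0.9)))

val picModel = new PowerIterationClustering()
  .setK(2)
  .setMaxIterations(20)
  .setInitializationMode("degree")
  .run(similarities)

picModel.assignments.collect().foreach { a =>
  println(s"node ${a.id} -> cluster ${a.cluster}")
}
// --- end sketch ---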
* @param id node id * @param cluster assigned cluster id - * @since 1.3.0 */ + @Since("1.3.0") @Experimental case class Assignment(id: Long, cluster: Int) diff --git a/mllib/src/main/scala/org/apache/spark/mllib/clustering/StreamingKMeans.scala b/mllib/src/main/scala/org/apache/spark/mllib/clustering/StreamingKMeans.scala index a915804b02c90..41f2668ec6a7d 100644 --- a/mllib/src/main/scala/org/apache/spark/mllib/clustering/StreamingKMeans.scala +++ b/mllib/src/main/scala/org/apache/spark/mllib/clustering/StreamingKMeans.scala @@ -20,7 +20,7 @@ package org.apache.spark.mllib.clustering import scala.reflect.ClassTag import org.apache.spark.Logging -import org.apache.spark.annotation.Experimental +import org.apache.spark.annotation.{Experimental, Since} import org.apache.spark.api.java.JavaSparkContext._ import org.apache.spark.mllib.linalg.{BLAS, Vector, Vectors} import org.apache.spark.rdd.RDD @@ -63,9 +63,8 @@ import org.apache.spark.util.random.XORShiftRandom * such that at time t + h the discount applied to the data from t is 0.5. * The definition remains the same whether the time unit is given * as batches or points. - * @since 1.2.0 - * */ +@Since("1.2.0") @Experimental class StreamingKMeansModel( override val clusterCenters: Array[Vector], @@ -73,8 +72,8 @@ class StreamingKMeansModel( /** * Perform a k-means update on a batch of data. - * @since 1.2.0 */ + @Since("1.2.0") def update(data: RDD[Vector], decayFactor: Double, timeUnit: String): StreamingKMeansModel = { // find nearest cluster to each point @@ -166,23 +165,23 @@ class StreamingKMeansModel( * .setRandomCenters(5, 100.0) * .trainOn(DStream) * }}} - * @since 1.2.0 */ +@Since("1.2.0") @Experimental class StreamingKMeans( var k: Int, var decayFactor: Double, var timeUnit: String) extends Logging with Serializable { - /** @since 1.2.0 */ + @Since("1.2.0") def this() = this(2, 1.0, StreamingKMeans.BATCHES) protected var model: StreamingKMeansModel = new StreamingKMeansModel(null, null) /** * Set the number of clusters. - * @since 1.2.0 */ + @Since("1.2.0") def setK(k: Int): this.type = { this.k = k this @@ -190,8 +189,8 @@ class StreamingKMeans( /** * Set the decay factor directly (for forgetful algorithms). - * @since 1.2.0 */ + @Since("1.2.0") def setDecayFactor(a: Double): this.type = { this.decayFactor = a this @@ -199,8 +198,8 @@ class StreamingKMeans( /** * Set the half life and time unit ("batches" or "points") for forgetful algorithms. - * @since 1.2.0 */ + @Since("1.2.0") def setHalfLife(halfLife: Double, timeUnit: String): this.type = { if (timeUnit != StreamingKMeans.BATCHES && timeUnit != StreamingKMeans.POINTS) { throw new IllegalArgumentException("Invalid time unit for decay: " + timeUnit) @@ -213,8 +212,8 @@ class StreamingKMeans( /** * Specify initial centers directly. - * @since 1.2.0 */ + @Since("1.2.0") def setInitialCenters(centers: Array[Vector], weights: Array[Double]): this.type = { model = new StreamingKMeansModel(centers, weights) this @@ -226,8 +225,8 @@ class StreamingKMeans( * @param dim Number of dimensions * @param weight Weight for each center * @param seed Random seed - * @since 1.2.0 */ + @Since("1.2.0") def setRandomCenters(dim: Int, weight: Double, seed: Long = Utils.random.nextLong): this.type = { val random = new XORShiftRandom(seed) val centers = Array.fill(k)(Vectors.dense(Array.fill(dim)(random.nextGaussian()))) @@ -238,8 +237,8 @@ class StreamingKMeans( /** * Return the latest model. 
- * @since 1.2.0 */ + @Since("1.2.0") def latestModel(): StreamingKMeansModel = { model } @@ -251,8 +250,8 @@ class StreamingKMeans( * and updates the model using each batch of data from the stream. * * @param data DStream containing vector data - * @since 1.2.0 */ + @Since("1.2.0") def trainOn(data: DStream[Vector]) { assertInitialized() data.foreachRDD { (rdd, time) => @@ -262,8 +261,8 @@ class StreamingKMeans( /** * Java-friendly version of `trainOn`. - * @since 1.4.0 */ + @Since("1.4.0") def trainOn(data: JavaDStream[Vector]): Unit = trainOn(data.dstream) /** @@ -271,8 +270,8 @@ class StreamingKMeans( * * @param data DStream containing vector data * @return DStream containing predictions - * @since 1.2.0 */ + @Since("1.2.0") def predictOn(data: DStream[Vector]): DStream[Int] = { assertInitialized() data.map(model.predict) @@ -280,8 +279,8 @@ class StreamingKMeans( /** * Java-friendly version of `predictOn`. - * @since 1.4.0 */ + @Since("1.4.0") def predictOn(data: JavaDStream[Vector]): JavaDStream[java.lang.Integer] = { JavaDStream.fromDStream(predictOn(data.dstream).asInstanceOf[DStream[java.lang.Integer]]) } @@ -292,8 +291,8 @@ class StreamingKMeans( * @param data DStream containing (key, feature vector) pairs * @tparam K key type * @return DStream containing the input keys and the predictions as values - * @since 1.2.0 */ + @Since("1.2.0") def predictOnValues[K: ClassTag](data: DStream[(K, Vector)]): DStream[(K, Int)] = { assertInitialized() data.mapValues(model.predict) @@ -301,8 +300,8 @@ class StreamingKMeans( /** * Java-friendly version of `predictOnValues`. - * @since 1.4.0 */ + @Since("1.4.0") def predictOnValues[K]( data: JavaPairDStream[K, Vector]): JavaPairDStream[K, java.lang.Integer] = { implicit val tag = fakeClassTag[K] diff --git a/mllib/src/main/scala/org/apache/spark/mllib/evaluation/BinaryClassificationMetrics.scala b/mllib/src/main/scala/org/apache/spark/mllib/evaluation/BinaryClassificationMetrics.scala index 486741edd6f5a..76ae847921f44 100644 --- a/mllib/src/main/scala/org/apache/spark/mllib/evaluation/BinaryClassificationMetrics.scala +++ b/mllib/src/main/scala/org/apache/spark/mllib/evaluation/BinaryClassificationMetrics.scala @@ -17,7 +17,7 @@ package org.apache.spark.mllib.evaluation -import org.apache.spark.annotation.Experimental +import org.apache.spark.annotation.{Experimental, Since} import org.apache.spark.Logging import org.apache.spark.SparkContext._ import org.apache.spark.mllib.evaluation.binary._ @@ -41,8 +41,8 @@ import org.apache.spark.sql.DataFrame * of bins may not exactly equal numBins. The last bin in each partition may * be smaller as a result, meaning there may be an extra sample at * partition boundaries. - * @since 1.3.0 */ +@Since("1.3.0") @Experimental class BinaryClassificationMetrics( val scoreAndLabels: RDD[(Double, Double)], @@ -52,8 +52,8 @@ class BinaryClassificationMetrics( /** * Defaults `numBins` to 0. - * @since 1.0.0 */ + @Since("1.0.0") def this(scoreAndLabels: RDD[(Double, Double)]) = this(scoreAndLabels, 0) /** @@ -65,16 +65,16 @@ class BinaryClassificationMetrics( /** * Unpersist intermediate RDDs used in the computation. - * @since 1.0.0 */ + @Since("1.0.0") def unpersist() { cumulativeCounts.unpersist() } /** * Returns thresholds in descending order. 
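// --- Illustrative usage sketch (not part of the patch) ---
// Ties together the StreamingKMeans methods annotated above: configure the
// model, train on one DStream and predict on another. Assumes an existing
// StreamingContext and two placeholder DStreams of Vectors,
// `trainingStream` and `testStream`.
import org.apache.spark.mllib.clustering.StreamingKMeans

val skm = new StreamingKMeans()
  .setK(3)
  .setDecayFactor(1.0)
  .setRandomCenters(5, 100.0)   // 5 dimensions, initial weight 100.0

skm.trainOn(trainingStream)
skm.predictOn(testStream).print()
// The current centers can be inspected between batches:
// skm.latestModel().clusterCenters.foreach(println)
// --- end sketch ---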
- * @since 1.0.0 */ + @Since("1.0.0") def thresholds(): RDD[Double] = cumulativeCounts.map(_._1) /** @@ -82,8 +82,8 @@ class BinaryClassificationMetrics( * which is an RDD of (false positive rate, true positive rate) * with (0.0, 0.0) prepended and (1.0, 1.0) appended to it. * @see http://en.wikipedia.org/wiki/Receiver_operating_characteristic - * @since 1.0.0 */ + @Since("1.0.0") def roc(): RDD[(Double, Double)] = { val rocCurve = createCurve(FalsePositiveRate, Recall) val sc = confusions.context @@ -94,16 +94,16 @@ class BinaryClassificationMetrics( /** * Computes the area under the receiver operating characteristic (ROC) curve. - * @since 1.0.0 */ + @Since("1.0.0") def areaUnderROC(): Double = AreaUnderCurve.of(roc()) /** * Returns the precision-recall curve, which is an RDD of (recall, precision), * NOT (precision, recall), with (0.0, 1.0) prepended to it. * @see http://en.wikipedia.org/wiki/Precision_and_recall - * @since 1.0.0 */ + @Since("1.0.0") def pr(): RDD[(Double, Double)] = { val prCurve = createCurve(Recall, Precision) val sc = confusions.context @@ -113,8 +113,8 @@ class BinaryClassificationMetrics( /** * Computes the area under the precision-recall curve. - * @since 1.0.0 */ + @Since("1.0.0") def areaUnderPR(): Double = AreaUnderCurve.of(pr()) /** @@ -122,26 +122,26 @@ class BinaryClassificationMetrics( * @param beta the beta factor in F-Measure computation. * @return an RDD of (threshold, F-Measure) pairs. * @see http://en.wikipedia.org/wiki/F1_score - * @since 1.0.0 */ + @Since("1.0.0") def fMeasureByThreshold(beta: Double): RDD[(Double, Double)] = createCurve(FMeasure(beta)) /** * Returns the (threshold, F-Measure) curve with beta = 1.0. - * @since 1.0.0 */ + @Since("1.0.0") def fMeasureByThreshold(): RDD[(Double, Double)] = fMeasureByThreshold(1.0) /** * Returns the (threshold, precision) curve. - * @since 1.0.0 */ + @Since("1.0.0") def precisionByThreshold(): RDD[(Double, Double)] = createCurve(Precision) /** * Returns the (threshold, recall) curve. - * @since 1.0.0 */ + @Since("1.0.0") def recallByThreshold(): RDD[(Double, Double)] = createCurve(Recall) private lazy val ( diff --git a/mllib/src/main/scala/org/apache/spark/mllib/evaluation/MulticlassMetrics.scala b/mllib/src/main/scala/org/apache/spark/mllib/evaluation/MulticlassMetrics.scala index dddfa3ea5b800..02e89d921033c 100644 --- a/mllib/src/main/scala/org/apache/spark/mllib/evaluation/MulticlassMetrics.scala +++ b/mllib/src/main/scala/org/apache/spark/mllib/evaluation/MulticlassMetrics.scala @@ -20,7 +20,7 @@ package org.apache.spark.mllib.evaluation import scala.collection.Map import org.apache.spark.SparkContext._ -import org.apache.spark.annotation.Experimental +import org.apache.spark.annotation.{Experimental, Since} import org.apache.spark.mllib.linalg.{Matrices, Matrix} import org.apache.spark.rdd.RDD import org.apache.spark.sql.DataFrame @@ -30,8 +30,8 @@ import org.apache.spark.sql.DataFrame * Evaluator for multiclass classification. * * @param predictionAndLabels an RDD of (prediction, label) pairs. 
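// --- Illustrative usage sketch (not part of the patch) ---
// Exercises the curve and area methods annotated above. Assumes an existing
// SparkContext `sc`; the (score, label) pairs are placeholders.
import org.apache.spark.mllib.evaluation.BinaryClassificationMetrics

val scoreAndLabels = sc.parallelize(Seq(
  (0.9, 1.0), (0.8, 1.0), (0.4, 0.0), (0.2, 1.0), (0.1, 0.0)))

val metrics = new BinaryClassificationMetrics(scoreAndLabels)
println(s"AUROC = ${metrics.areaUnderROC()}")
println(s"AUPRC = ${metrics.areaUnderPR()}")
metrics.fMeasureByThreshold().collect().foreach { case (t, f) =>
  println(s"threshold=$t F1=$f")
}
metrics.unpersist()
// --- end sketch ---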
- * @since 1.1.0 */ +@Since("1.1.0") @Experimental class MulticlassMetrics(predictionAndLabels: RDD[(Double, Double)]) { @@ -65,8 +65,8 @@ class MulticlassMetrics(predictionAndLabels: RDD[(Double, Double)]) { * predicted classes are in columns, * they are ordered by class label ascending, * as in "labels" - * @since 1.1.0 */ + @Since("1.1.0") def confusionMatrix: Matrix = { val n = labels.size val values = Array.ofDim[Double](n * n) @@ -85,15 +85,15 @@ class MulticlassMetrics(predictionAndLabels: RDD[(Double, Double)]) { /** * Returns true positive rate for a given label (category) * @param label the label. - * @since 1.1.0 */ + @Since("1.1.0") def truePositiveRate(label: Double): Double = recall(label) /** * Returns false positive rate for a given label (category) * @param label the label. - * @since 1.1.0 */ + @Since("1.1.0") def falsePositiveRate(label: Double): Double = { val fp = fpByClass.getOrElse(label, 0) fp.toDouble / (labelCount - labelCountByClass(label)) @@ -102,8 +102,8 @@ class MulticlassMetrics(predictionAndLabels: RDD[(Double, Double)]) { /** * Returns precision for a given label (category) * @param label the label. - * @since 1.1.0 */ + @Since("1.1.0") def precision(label: Double): Double = { val tp = tpByClass(label) val fp = fpByClass.getOrElse(label, 0) @@ -113,16 +113,16 @@ class MulticlassMetrics(predictionAndLabels: RDD[(Double, Double)]) { /** * Returns recall for a given label (category) * @param label the label. - * @since 1.1.0 */ + @Since("1.1.0") def recall(label: Double): Double = tpByClass(label).toDouble / labelCountByClass(label) /** * Returns f-measure for a given label (category) * @param label the label. * @param beta the beta parameter. - * @since 1.1.0 */ + @Since("1.1.0") def fMeasure(label: Double, beta: Double): Double = { val p = precision(label) val r = recall(label) @@ -133,8 +133,8 @@ class MulticlassMetrics(predictionAndLabels: RDD[(Double, Double)]) { /** * Returns f1-measure for a given label (category) * @param label the label. - * @since 1.1.0 */ + @Since("1.1.0") def fMeasure(label: Double): Double = fMeasure(label, 1.0) /** @@ -187,8 +187,8 @@ class MulticlassMetrics(predictionAndLabels: RDD[(Double, Double)]) { /** * Returns weighted averaged f-measure * @param beta the beta parameter. - * @since 1.1.0 */ + @Since("1.1.0") def weightedFMeasure(beta: Double): Double = labelCountByClass.map { case (category, count) => fMeasure(category, beta) * count.toDouble / labelCount }.sum diff --git a/mllib/src/main/scala/org/apache/spark/mllib/evaluation/MultilabelMetrics.scala b/mllib/src/main/scala/org/apache/spark/mllib/evaluation/MultilabelMetrics.scala index 77cb1e09bdbb5..a0a8d9c56847b 100644 --- a/mllib/src/main/scala/org/apache/spark/mllib/evaluation/MultilabelMetrics.scala +++ b/mllib/src/main/scala/org/apache/spark/mllib/evaluation/MultilabelMetrics.scala @@ -17,6 +17,7 @@ package org.apache.spark.mllib.evaluation +import org.apache.spark.annotation.Since import org.apache.spark.rdd.RDD import org.apache.spark.SparkContext._ import org.apache.spark.sql.DataFrame @@ -25,8 +26,8 @@ import org.apache.spark.sql.DataFrame * Evaluator for multilabel classification. * @param predictionAndLabels an RDD of (predictions, labels) pairs, * both are non-null Arrays, each with unique elements. 
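// --- Illustrative usage sketch (not part of the patch) ---
// Shows the per-label and weighted metrics annotated above. Assumes an
// existing SparkContext `sc`; the (prediction, label) pairs are placeholders.
import org.apache.spark.mllib.evaluation.MulticlassMetrics

val predictionAndLabels = sc.parallelize(Seq(
  (0.0, 0.0), (1.0, 1.0), (1.0, 0.0), (2.0, 2.0), (2.0, 1.0)))

val mm = new MulticlassMetrics(predictionAndLabels)
println(mm.confusionMatrix)
mm.labels.foreach { l =>
  println(s"label $l: precision=${mm.precision(l)} " +
    s"recall=${mm.recall(l)} f1=${mm.fMeasure(l)}")
}
println(s"weighted F(beta=0.5) = ${mm.weightedFMeasure(0.5)}")
// --- end sketch ---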
- * @since 1.2.0 */ +@Since("1.2.0") class MultilabelMetrics(predictionAndLabels: RDD[(Array[Double], Array[Double])]) { /** @@ -104,8 +105,8 @@ class MultilabelMetrics(predictionAndLabels: RDD[(Array[Double], Array[Double])] /** * Returns precision for a given label (category) * @param label the label. - * @since 1.2.0 */ + @Since("1.2.0") def precision(label: Double): Double = { val tp = tpPerClass(label) val fp = fpPerClass.getOrElse(label, 0L) @@ -115,8 +116,8 @@ class MultilabelMetrics(predictionAndLabels: RDD[(Array[Double], Array[Double])] /** * Returns recall for a given label (category) * @param label the label. - * @since 1.2.0 */ + @Since("1.2.0") def recall(label: Double): Double = { val tp = tpPerClass(label) val fn = fnPerClass.getOrElse(label, 0L) @@ -126,8 +127,8 @@ class MultilabelMetrics(predictionAndLabels: RDD[(Array[Double], Array[Double])] /** * Returns f1-measure for a given label (category) * @param label the label. - * @since 1.2.0 */ + @Since("1.2.0") def f1Measure(label: Double): Double = { val p = precision(label) val r = recall(label) diff --git a/mllib/src/main/scala/org/apache/spark/mllib/evaluation/RankingMetrics.scala b/mllib/src/main/scala/org/apache/spark/mllib/evaluation/RankingMetrics.scala index 063fbed8cdeea..a7f43f0b110f5 100644 --- a/mllib/src/main/scala/org/apache/spark/mllib/evaluation/RankingMetrics.scala +++ b/mllib/src/main/scala/org/apache/spark/mllib/evaluation/RankingMetrics.scala @@ -23,7 +23,7 @@ import scala.collection.JavaConverters._ import scala.reflect.ClassTag import org.apache.spark.Logging -import org.apache.spark.annotation.Experimental +import org.apache.spark.annotation.{Experimental, Since} import org.apache.spark.api.java.{JavaSparkContext, JavaRDD} import org.apache.spark.rdd.RDD @@ -34,8 +34,8 @@ import org.apache.spark.rdd.RDD * Java users should use [[RankingMetrics$.of]] to create a [[RankingMetrics]] instance. * * @param predictionAndLabels an RDD of (predicted ranking, ground truth set) pairs. - * @since 1.2.0 */ +@Since("1.2.0") @Experimental class RankingMetrics[T: ClassTag](predictionAndLabels: RDD[(Array[T], Array[T])]) extends Logging with Serializable { @@ -56,8 +56,8 @@ class RankingMetrics[T: ClassTag](predictionAndLabels: RDD[(Array[T], Array[T])] * * @param k the position to compute the truncated precision, must be positive * @return the average precision at the first k ranking positions - * @since 1.2.0 */ + @Since("1.2.0") def precisionAt(k: Int): Double = { require(k > 0, "ranking position k should be positive") predictionAndLabels.map { case (pred, lab) => @@ -126,8 +126,8 @@ class RankingMetrics[T: ClassTag](predictionAndLabels: RDD[(Array[T], Array[T])] * * @param k the position to compute the truncated ndcg, must be positive * @return the average ndcg at the first k ranking positions - * @since 1.2.0 */ + @Since("1.2.0") def ndcgAt(k: Int): Double = { require(k > 0, "ranking position k should be positive") predictionAndLabels.map { case (pred, lab) => @@ -165,8 +165,8 @@ object RankingMetrics { /** * Creates a [[RankingMetrics]] instance (for Java users). 
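// --- Illustrative usage sketch (not part of the patch) ---
// Shows precisionAt and ndcgAt from the RankingMetrics hunk above. Assumes an
// existing SparkContext `sc`; the (predicted ranking, ground-truth set) pairs
// are placeholders.
import org.apache.spark.mllib.evaluation.RankingMetrics

val rankingData = sc.parallelize(Seq(
  (Array(1, 6, 2, 7, 8), Array(1, 2, 3, 4, 5)),
  (Array(4, 1, 5, 6, 2), Array(1, 2, 3))))

val rm = new RankingMetrics(rankingData)
println(s"precision@3 = ${rm.precisionAt(3)}")
println(s"ndcg@3      = ${rm.ndcgAt(3)}")
// --- end sketch ---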
* @param predictionAndLabels a JavaRDD of (predicted ranking, ground truth set) pairs - * @since 1.4.0 */ + @Since("1.4.0") def of[E, T <: jl.Iterable[E]](predictionAndLabels: JavaRDD[(T, T)]): RankingMetrics[E] = { implicit val tag = JavaSparkContext.fakeClassTag[E] val rdd = predictionAndLabels.rdd.map { case (predictions, labels) => diff --git a/mllib/src/main/scala/org/apache/spark/mllib/evaluation/RegressionMetrics.scala b/mllib/src/main/scala/org/apache/spark/mllib/evaluation/RegressionMetrics.scala index 54dfd8c099494..36a6c357c3897 100644 --- a/mllib/src/main/scala/org/apache/spark/mllib/evaluation/RegressionMetrics.scala +++ b/mllib/src/main/scala/org/apache/spark/mllib/evaluation/RegressionMetrics.scala @@ -17,7 +17,7 @@ package org.apache.spark.mllib.evaluation -import org.apache.spark.annotation.Experimental +import org.apache.spark.annotation.{Experimental, Since} import org.apache.spark.rdd.RDD import org.apache.spark.Logging import org.apache.spark.mllib.linalg.Vectors @@ -29,8 +29,8 @@ import org.apache.spark.sql.DataFrame * Evaluator for regression. * * @param predictionAndObservations an RDD of (prediction, observation) pairs. - * @since 1.2.0 */ +@Since("1.2.0") @Experimental class RegressionMetrics(predictionAndObservations: RDD[(Double, Double)]) extends Logging { @@ -67,8 +67,8 @@ class RegressionMetrics(predictionAndObservations: RDD[(Double, Double)]) extend * Returns the variance explained by regression. * explainedVariance = \sum_i (\hat{y_i} - \bar{y})^2 / n * @see [[https://en.wikipedia.org/wiki/Fraction_of_variance_unexplained]] - * @since 1.2.0 */ + @Since("1.2.0") def explainedVariance: Double = { SSreg / summary.count } @@ -76,8 +76,8 @@ class RegressionMetrics(predictionAndObservations: RDD[(Double, Double)]) extend /** * Returns the mean absolute error, which is a risk function corresponding to the * expected value of the absolute error loss or l1-norm loss. - * @since 1.2.0 */ + @Since("1.2.0") def meanAbsoluteError: Double = { summary.normL1(1) / summary.count } @@ -85,8 +85,8 @@ class RegressionMetrics(predictionAndObservations: RDD[(Double, Double)]) extend /** * Returns the mean squared error, which is a risk function corresponding to the * expected value of the squared error loss or quadratic loss. - * @since 1.2.0 */ + @Since("1.2.0") def meanSquaredError: Double = { SSerr / summary.count } @@ -94,8 +94,8 @@ class RegressionMetrics(predictionAndObservations: RDD[(Double, Double)]) extend /** * Returns the root mean squared error, which is defined as the square root of * the mean squared error. - * @since 1.2.0 */ + @Since("1.2.0") def rootMeanSquaredError: Double = { math.sqrt(this.meanSquaredError) } @@ -103,8 +103,8 @@ class RegressionMetrics(predictionAndObservations: RDD[(Double, Double)]) extend /** * Returns R^2^, the unadjusted coefficient of determination. 
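// --- Illustrative usage sketch (not part of the patch) ---
// Covers the error and variance measures annotated above. Assumes an existing
// SparkContext `sc`; the (prediction, observation) pairs are placeholders.
import org.apache.spark.mllib.evaluation.RegressionMetrics

val predictionAndObservations = sc.parallelize(Seq(
  (2.5, 3.0), (0.0, -0.5), (2.0, 2.0), (8.0, 7.0)))

val reg = new RegressionMetrics(predictionAndObservations)
println(s"MAE  = ${reg.meanAbsoluteError}")
println(s"MSE  = ${reg.meanSquaredError}")
println(s"RMSE = ${reg.rootMeanSquaredError}")
println(s"R^2  = ${reg.r2}")
// --- end sketch ---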
* @see [[http://en.wikipedia.org/wiki/Coefficient_of_determination]] - * @since 1.2.0 */ + @Since("1.2.0") def r2: Double = { 1 - SSerr / SStot } diff --git a/mllib/src/main/scala/org/apache/spark/mllib/fpm/AssociationRules.scala b/mllib/src/main/scala/org/apache/spark/mllib/fpm/AssociationRules.scala index 7f4de77044994..ba3b447a83398 100644 --- a/mllib/src/main/scala/org/apache/spark/mllib/fpm/AssociationRules.scala +++ b/mllib/src/main/scala/org/apache/spark/mllib/fpm/AssociationRules.scala @@ -20,7 +20,7 @@ import scala.collection.JavaConverters._ import scala.reflect.ClassTag import org.apache.spark.Logging -import org.apache.spark.annotation.Experimental +import org.apache.spark.annotation.{Experimental, Since} import org.apache.spark.api.java.JavaRDD import org.apache.spark.api.java.JavaSparkContext.fakeClassTag import org.apache.spark.mllib.fpm.AssociationRules.Rule @@ -33,24 +33,22 @@ import org.apache.spark.rdd.RDD * Generates association rules from a [[RDD[FreqItemset[Item]]]. This method only generates * association rules which have a single item as the consequent. * - * @since 1.5.0 */ +@Since("1.5.0") @Experimental class AssociationRules private[fpm] ( private var minConfidence: Double) extends Logging with Serializable { /** * Constructs a default instance with default parameters {minConfidence = 0.8}. - * - * @since 1.5.0 */ + @Since("1.5.0") def this() = this(0.8) /** * Sets the minimal confidence (default: `0.8`). - * - * @since 1.5.0 */ + @Since("1.5.0") def setMinConfidence(minConfidence: Double): this.type = { require(minConfidence >= 0.0 && minConfidence <= 1.0) this.minConfidence = minConfidence @@ -62,8 +60,8 @@ class AssociationRules private[fpm] ( * @param freqItemsets frequent itemset model obtained from [[FPGrowth]] * @return a [[Set[Rule[Item]]] containing the assocation rules. * - * @since 1.5.0 */ + @Since("1.5.0") def run[Item: ClassTag](freqItemsets: RDD[FreqItemset[Item]]): RDD[Rule[Item]] = { // For candidate rule X => Y, generate (X, (Y, freq(X union Y))) val candidates = freqItemsets.flatMap { itemset => @@ -102,8 +100,8 @@ object AssociationRules { * instead. * @tparam Item item type * - * @since 1.5.0 */ + @Since("1.5.0") @Experimental class Rule[Item] private[fpm] ( val antecedent: Array[Item], @@ -114,8 +112,8 @@ object AssociationRules { /** * Returns the confidence of the rule. * - * @since 1.5.0 */ + @Since("1.5.0") def confidence: Double = freqUnion.toDouble / freqAntecedent require(antecedent.toSet.intersect(consequent.toSet).isEmpty, { @@ -127,8 +125,8 @@ object AssociationRules { /** * Returns antecedent in a Java List. * - * @since 1.5.0 */ + @Since("1.5.0") def javaAntecedent: java.util.List[Item] = { antecedent.toList.asJava } @@ -136,8 +134,8 @@ object AssociationRules { /** * Returns consequent in a Java List. 
* - * @since 1.5.0 */ + @Since("1.5.0") def javaConsequent: java.util.List[Item] = { consequent.toList.asJava } diff --git a/mllib/src/main/scala/org/apache/spark/mllib/fpm/FPGrowth.scala b/mllib/src/main/scala/org/apache/spark/mllib/fpm/FPGrowth.scala index e2370a52f4930..e37f806271680 100644 --- a/mllib/src/main/scala/org/apache/spark/mllib/fpm/FPGrowth.scala +++ b/mllib/src/main/scala/org/apache/spark/mllib/fpm/FPGrowth.scala @@ -25,7 +25,7 @@ import scala.collection.JavaConverters._ import scala.reflect.ClassTag import org.apache.spark.{HashPartitioner, Logging, Partitioner, SparkException} -import org.apache.spark.annotation.Experimental +import org.apache.spark.annotation.{Experimental, Since} import org.apache.spark.api.java.JavaRDD import org.apache.spark.api.java.JavaSparkContext.fakeClassTag import org.apache.spark.mllib.fpm.FPGrowth._ @@ -39,15 +39,15 @@ import org.apache.spark.storage.StorageLevel * @param freqItemsets frequent itemset, which is an RDD of [[FreqItemset]] * @tparam Item item type * - * @since 1.3.0 */ +@Since("1.3.0") @Experimental class FPGrowthModel[Item: ClassTag](val freqItemsets: RDD[FreqItemset[Item]]) extends Serializable { /** * Generates association rules for the [[Item]]s in [[freqItemsets]]. * @param confidence minimal confidence of the rules produced - * @since 1.5.0 */ + @Since("1.5.0") def generateAssociationRules(confidence: Double): RDD[AssociationRules.Rule[Item]] = { val associationRules = new AssociationRules(confidence) associationRules.run(freqItemsets) @@ -71,8 +71,8 @@ class FPGrowthModel[Item: ClassTag](val freqItemsets: RDD[FreqItemset[Item]]) ex * @see [[http://en.wikipedia.org/wiki/Association_rule_learning Association rule learning * (Wikipedia)]] * - * @since 1.3.0 */ +@Since("1.3.0") @Experimental class FPGrowth private ( private var minSupport: Double, @@ -82,15 +82,15 @@ class FPGrowth private ( * Constructs a default instance with default parameters {minSupport: `0.3`, numPartitions: same * as the input data}. * - * @since 1.3.0 */ + @Since("1.3.0") def this() = this(0.3, -1) /** * Sets the minimal support level (default: `0.3`). * - * @since 1.3.0 */ + @Since("1.3.0") def setMinSupport(minSupport: Double): this.type = { this.minSupport = minSupport this @@ -99,8 +99,8 @@ class FPGrowth private ( /** * Sets the number of partitions used by parallel FP-growth (default: same as input data). * - * @since 1.3.0 */ + @Since("1.3.0") def setNumPartitions(numPartitions: Int): this.type = { this.numPartitions = numPartitions this @@ -111,8 +111,8 @@ class FPGrowth private ( * @param data input data set, each element contains a transaction * @return an [[FPGrowthModel]] * - * @since 1.3.0 */ + @Since("1.3.0") def run[Item: ClassTag](data: RDD[Array[Item]]): FPGrowthModel[Item] = { if (data.getStorageLevel == StorageLevel.NONE) { logWarning("Input data is not cached.") @@ -213,8 +213,8 @@ class FPGrowth private ( /** * :: Experimental :: * - * @since 1.3.0 */ +@Since("1.3.0") @Experimental object FPGrowth { @@ -224,15 +224,15 @@ object FPGrowth { * @param freq frequency * @tparam Item item type * - * @since 1.3.0 */ + @Since("1.3.0") class FreqItemset[Item](val items: Array[Item], val freq: Long) extends Serializable { /** * Returns items in a Java List. 
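// --- Illustrative usage sketch (not part of the patch) ---
// Runs FP-growth on toy transactions and derives association rules via
// generateAssociationRules, tying together the AssociationRules and FPGrowth
// hunks above. Assumes an existing SparkContext `sc`; the transactions are
// placeholders.
import org.apache.spark.mllib.fpm.FPGrowth

val transactions = sc.parallelize(Seq(
  Array("a", "b", "c"),
  Array("a", "b"),
  Array("a", "c"),
  Array("b", "c")))

val fpModel = new FPGrowth()
  .setMinSupport(0.5)
  .setNumPartitions(2)
  .run(transactions)

fpModel.freqItemsets.collect().foreach { itemset =>
  println(s"${itemset.items.mkString("[", ",", "]")} freq=${itemset.freq}")
}

fpModel.generateAssociationRules(confidence = 0.8).collect().foreach { rule =>
  println(s"${rule.antecedent.mkString(",")} => ${rule.consequent.mkString(",")} " +
    s"(conf=${rule.confidence})")
}
// --- end sketch ---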
* - * @since 1.3.0 */ + @Since("1.3.0") def javaItems: java.util.List[Item] = { items.toList.asJava } diff --git a/mllib/src/main/scala/org/apache/spark/mllib/linalg/Matrices.scala b/mllib/src/main/scala/org/apache/spark/mllib/linalg/Matrices.scala index dfa8910fcbc8c..28b5b4637bf17 100644 --- a/mllib/src/main/scala/org/apache/spark/mllib/linalg/Matrices.scala +++ b/mllib/src/main/scala/org/apache/spark/mllib/linalg/Matrices.scala @@ -23,7 +23,7 @@ import scala.collection.mutable.{ArrayBuilder => MArrayBuilder, HashSet => MHash import breeze.linalg.{CSCMatrix => BSM, DenseMatrix => BDM, Matrix => BM} -import org.apache.spark.annotation.DeveloperApi +import org.apache.spark.annotation.{DeveloperApi, Since} import org.apache.spark.sql.catalyst.expressions.GenericMutableRow import org.apache.spark.sql.catalyst.InternalRow import org.apache.spark.sql.types._ @@ -227,8 +227,8 @@ private[spark] class MatrixUDT extends UserDefinedType[Matrix] { * @param values matrix entries in column major if not transposed or in row major otherwise * @param isTransposed whether the matrix is transposed. If true, `values` stores the matrix in * row major. - * @since 1.0.0 */ +@Since("1.0.0") @SQLUserDefinedType(udt = classOf[MatrixUDT]) class DenseMatrix( val numRows: Int, @@ -253,8 +253,8 @@ class DenseMatrix( * @param numRows number of rows * @param numCols number of columns * @param values matrix entries in column major - * @since 1.3.0 */ + @Since("1.3.0") def this(numRows: Int, numCols: Int, values: Array[Double]) = this(numRows, numCols, values, false) @@ -278,9 +278,7 @@ class DenseMatrix( private[mllib] def apply(i: Int): Double = values(i) - /** - * @since 1.3.0 - */ + @Since("1.3.0") override def apply(i: Int, j: Int): Double = values(index(i, j)) private[mllib] def index(i: Int, j: Int): Int = { @@ -291,9 +289,7 @@ class DenseMatrix( values(index(i, j)) = v } - /** - * @since 1.4.0 - */ + @Since("1.4.0") override def copy: DenseMatrix = new DenseMatrix(numRows, numCols, values.clone()) private[spark] def map(f: Double => Double) = new DenseMatrix(numRows, numCols, values.map(f), @@ -309,9 +305,7 @@ class DenseMatrix( this } - /** - * @since 1.3.0 - */ + @Since("1.3.0") override def transpose: DenseMatrix = new DenseMatrix(numCols, numRows, values, !isTransposed) private[spark] override def foreachActive(f: (Int, Int, Double) => Unit): Unit = { @@ -342,21 +336,17 @@ class DenseMatrix( } } - /** - * @since 1.5.0 - */ + @Since("1.5.0") override def numNonzeros: Int = values.count(_ != 0) - /** - * @since 1.5.0 - */ + @Since("1.5.0") override def numActives: Int = values.length /** * Generate a `SparseMatrix` from the given `DenseMatrix`. The new matrix will have isTransposed * set to false. - * @since 1.3.0 */ + @Since("1.3.0") def toSparse: SparseMatrix = { val spVals: MArrayBuilder[Double] = new MArrayBuilder.ofDouble val colPtrs: Array[Int] = new Array[Int](numCols + 1) @@ -383,8 +373,8 @@ class DenseMatrix( /** * Factory methods for [[org.apache.spark.mllib.linalg.DenseMatrix]]. 
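// --- Illustrative usage sketch (not part of the patch) ---
// Shows the column-major layout and the toSparse/transpose members annotated
// above; purely local code, no SparkContext needed.
import org.apache.spark.mllib.linalg.DenseMatrix

// 2 x 3 matrix; values are given in column-major order:
//   [ 1.0  0.0  5.0 ]
//   [ 2.0  0.0  6.0 ]
val dm = new DenseMatrix(2, 3, Array(1.0, 2.0, 0.0, 0.0, 5.0, 6.0))
println(dm(1, 2))          // 6.0
println(dm.numNonzeros)    // 4
val sparse = dm.toSparse   // drops the all-zero column
val dt = dm.transpose      // 3 x 2 view sharing the same values array
// --- end sketch ---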
- * @since 1.3.0 */ +@Since("1.3.0") object DenseMatrix { /** @@ -392,8 +382,8 @@ object DenseMatrix { * @param numRows number of rows of the matrix * @param numCols number of columns of the matrix * @return `DenseMatrix` with size `numRows` x `numCols` and values of zeros - * @since 1.3.0 */ + @Since("1.3.0") def zeros(numRows: Int, numCols: Int): DenseMatrix = { require(numRows.toLong * numCols <= Int.MaxValue, s"$numRows x $numCols dense matrix is too large to allocate") @@ -405,8 +395,8 @@ object DenseMatrix { * @param numRows number of rows of the matrix * @param numCols number of columns of the matrix * @return `DenseMatrix` with size `numRows` x `numCols` and values of ones - * @since 1.3.0 */ + @Since("1.3.0") def ones(numRows: Int, numCols: Int): DenseMatrix = { require(numRows.toLong * numCols <= Int.MaxValue, s"$numRows x $numCols dense matrix is too large to allocate") @@ -417,8 +407,8 @@ object DenseMatrix { * Generate an Identity Matrix in `DenseMatrix` format. * @param n number of rows and columns of the matrix * @return `DenseMatrix` with size `n` x `n` and values of ones on the diagonal - * @since 1.3.0 */ + @Since("1.3.0") def eye(n: Int): DenseMatrix = { val identity = DenseMatrix.zeros(n, n) var i = 0 @@ -435,8 +425,8 @@ object DenseMatrix { * @param numCols number of columns of the matrix * @param rng a random number generator * @return `DenseMatrix` with size `numRows` x `numCols` and values in U(0, 1) - * @since 1.3.0 */ + @Since("1.3.0") def rand(numRows: Int, numCols: Int, rng: Random): DenseMatrix = { require(numRows.toLong * numCols <= Int.MaxValue, s"$numRows x $numCols dense matrix is too large to allocate") @@ -449,8 +439,8 @@ object DenseMatrix { * @param numCols number of columns of the matrix * @param rng a random number generator * @return `DenseMatrix` with size `numRows` x `numCols` and values in N(0, 1) - * @since 1.3.0 */ + @Since("1.3.0") def randn(numRows: Int, numCols: Int, rng: Random): DenseMatrix = { require(numRows.toLong * numCols <= Int.MaxValue, s"$numRows x $numCols dense matrix is too large to allocate") @@ -462,8 +452,8 @@ object DenseMatrix { * @param vector a `Vector` that will form the values on the diagonal of the matrix * @return Square `DenseMatrix` with size `values.length` x `values.length` and `values` * on the diagonal - * @since 1.3.0 */ + @Since("1.3.0") def diag(vector: Vector): DenseMatrix = { val n = vector.size val matrix = DenseMatrix.zeros(n, n) @@ -498,8 +488,8 @@ object DenseMatrix { * @param isTransposed whether the matrix is transposed. If true, the matrix can be considered * Compressed Sparse Row (CSR) format, where `colPtrs` behaves as rowPtrs, * and `rowIndices` behave as colIndices, and `values` are stored in row major. - * @since 1.2.0 */ +@Since("1.2.0") @SQLUserDefinedType(udt = classOf[MatrixUDT]) class SparseMatrix( val numRows: Int, @@ -536,8 +526,8 @@ class SparseMatrix( * @param rowIndices the row index of the entry. 
They must be in strictly increasing * order for each column * @param values non-zero matrix entries in column major - * @since 1.3.0 */ + @Since("1.3.0") def this( numRows: Int, numCols: Int, @@ -560,8 +550,8 @@ class SparseMatrix( } /** - * @since 1.3.0 */ + @Since("1.3.0") override def apply(i: Int, j: Int): Double = { val ind = index(i, j) if (ind < 0) 0.0 else values(ind) @@ -585,9 +575,7 @@ class SparseMatrix( } } - /** - * @since 1.4.0 - */ + @Since("1.4.0") override def copy: SparseMatrix = { new SparseMatrix(numRows, numCols, colPtrs, rowIndices, values.clone()) } @@ -605,9 +593,7 @@ class SparseMatrix( this } - /** - * @since 1.3.0 - */ + @Since("1.3.0") override def transpose: SparseMatrix = new SparseMatrix(numCols, numRows, colPtrs, rowIndices, values, !isTransposed) @@ -641,28 +627,24 @@ class SparseMatrix( /** * Generate a `DenseMatrix` from the given `SparseMatrix`. The new matrix will have isTransposed * set to false. - * @since 1.3.0 */ + @Since("1.3.0") def toDense: DenseMatrix = { new DenseMatrix(numRows, numCols, toArray) } - /** - * @since 1.5.0 - */ + @Since("1.5.0") override def numNonzeros: Int = values.count(_ != 0) - /** - * @since 1.5.0 - */ + @Since("1.5.0") override def numActives: Int = values.length } /** * Factory methods for [[org.apache.spark.mllib.linalg.SparseMatrix]]. - * @since 1.3.0 */ +@Since("1.3.0") object SparseMatrix { /** @@ -673,8 +655,8 @@ object SparseMatrix { * @param numCols number of columns of the matrix * @param entries Array of (i, j, value) tuples * @return The corresponding `SparseMatrix` - * @since 1.3.0 */ + @Since("1.3.0") def fromCOO(numRows: Int, numCols: Int, entries: Iterable[(Int, Int, Double)]): SparseMatrix = { val sortedEntries = entries.toSeq.sortBy(v => (v._2, v._1)) val numEntries = sortedEntries.size @@ -722,8 +704,8 @@ object SparseMatrix { * Generate an Identity Matrix in `SparseMatrix` format. * @param n number of rows and columns of the matrix * @return `SparseMatrix` with size `n` x `n` and values of ones on the diagonal - * @since 1.3.0 */ + @Since("1.3.0") def speye(n: Int): SparseMatrix = { new SparseMatrix(n, n, (0 to n).toArray, (0 until n).toArray, Array.fill(n)(1.0)) } @@ -792,8 +774,8 @@ object SparseMatrix { * @param density the desired density for the matrix * @param rng a random number generator * @return `SparseMatrix` with size `numRows` x `numCols` and values in U(0, 1) - * @since 1.3.0 */ + @Since("1.3.0") def sprand(numRows: Int, numCols: Int, density: Double, rng: Random): SparseMatrix = { val mat = genRandMatrix(numRows, numCols, density, rng) mat.update(i => rng.nextDouble()) @@ -806,8 +788,8 @@ object SparseMatrix { * @param density the desired density for the matrix * @param rng a random number generator * @return `SparseMatrix` with size `numRows` x `numCols` and values in N(0, 1) - * @since 1.3.0 */ + @Since("1.3.0") def sprandn(numRows: Int, numCols: Int, density: Double, rng: Random): SparseMatrix = { val mat = genRandMatrix(numRows, numCols, density, rng) mat.update(i => rng.nextGaussian()) @@ -818,8 +800,8 @@ object SparseMatrix { * @param vector a `Vector` that will form the values on the diagonal of the matrix * @return Square `SparseMatrix` with size `values.length` x `values.length` and non-zero * `values` on the diagonal - * @since 1.3.0 */ + @Since("1.3.0") def spdiag(vector: Vector): SparseMatrix = { val n = vector.size vector match { @@ -835,8 +817,8 @@ object SparseMatrix { /** * Factory methods for [[org.apache.spark.mllib.linalg.Matrix]]. 
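// --- Illustrative usage sketch (not part of the patch) ---
// Builds a SparseMatrix from (row, col, value) tuples with fromCOO and uses
// the accessors annotated above; purely local code.
import org.apache.spark.mllib.linalg.SparseMatrix

val cooEntries = Seq((0, 0, 1.0), (2, 1, 3.0), (1, 2, 2.0))
val sm = SparseMatrix.fromCOO(3, 3, cooEntries)
println(sm(2, 1))          // 3.0
println(sm.numActives)     // 3 stored entries
val dense = sm.toDense     // expand to a DenseMatrix
val eye = SparseMatrix.speye(3)   // sparse 3 x 3 identity
// --- end sketch ---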
- * @since 1.0.0 */ +@Since("1.0.0") object Matrices { /** @@ -845,8 +827,8 @@ object Matrices { * @param numRows number of rows * @param numCols number of columns * @param values matrix entries in column major - * @since 1.0.0 */ + @Since("1.0.0") def dense(numRows: Int, numCols: Int, values: Array[Double]): Matrix = { new DenseMatrix(numRows, numCols, values) } @@ -859,8 +841,8 @@ object Matrices { * @param colPtrs the index corresponding to the start of a new column * @param rowIndices the row index of the entry * @param values non-zero matrix entries in column major - * @since 1.2.0 */ + @Since("1.2.0") def sparse( numRows: Int, numCols: Int, @@ -893,8 +875,8 @@ object Matrices { * @param numRows number of rows of the matrix * @param numCols number of columns of the matrix * @return `Matrix` with size `numRows` x `numCols` and values of zeros - * @since 1.2.0 */ + @Since("1.2.0") def zeros(numRows: Int, numCols: Int): Matrix = DenseMatrix.zeros(numRows, numCols) /** @@ -902,24 +884,24 @@ object Matrices { * @param numRows number of rows of the matrix * @param numCols number of columns of the matrix * @return `Matrix` with size `numRows` x `numCols` and values of ones - * @since 1.2.0 */ + @Since("1.2.0") def ones(numRows: Int, numCols: Int): Matrix = DenseMatrix.ones(numRows, numCols) /** * Generate a dense Identity Matrix in `Matrix` format. * @param n number of rows and columns of the matrix * @return `Matrix` with size `n` x `n` and values of ones on the diagonal - * @since 1.2.0 */ + @Since("1.2.0") def eye(n: Int): Matrix = DenseMatrix.eye(n) /** * Generate a sparse Identity Matrix in `Matrix` format. * @param n number of rows and columns of the matrix * @return `Matrix` with size `n` x `n` and values of ones on the diagonal - * @since 1.3.0 */ + @Since("1.3.0") def speye(n: Int): Matrix = SparseMatrix.speye(n) /** @@ -928,8 +910,8 @@ object Matrices { * @param numCols number of columns of the matrix * @param rng a random number generator * @return `Matrix` with size `numRows` x `numCols` and values in U(0, 1) - * @since 1.2.0 */ + @Since("1.2.0") def rand(numRows: Int, numCols: Int, rng: Random): Matrix = DenseMatrix.rand(numRows, numCols, rng) @@ -940,8 +922,8 @@ object Matrices { * @param density the desired density for the matrix * @param rng a random number generator * @return `Matrix` with size `numRows` x `numCols` and values in U(0, 1) - * @since 1.3.0 */ + @Since("1.3.0") def sprand(numRows: Int, numCols: Int, density: Double, rng: Random): Matrix = SparseMatrix.sprand(numRows, numCols, density, rng) @@ -951,8 +933,8 @@ object Matrices { * @param numCols number of columns of the matrix * @param rng a random number generator * @return `Matrix` with size `numRows` x `numCols` and values in N(0, 1) - * @since 1.2.0 */ + @Since("1.2.0") def randn(numRows: Int, numCols: Int, rng: Random): Matrix = DenseMatrix.randn(numRows, numCols, rng) @@ -963,8 +945,8 @@ object Matrices { * @param density the desired density for the matrix * @param rng a random number generator * @return `Matrix` with size `numRows` x `numCols` and values in N(0, 1) - * @since 1.3.0 */ + @Since("1.3.0") def sprandn(numRows: Int, numCols: Int, density: Double, rng: Random): Matrix = SparseMatrix.sprandn(numRows, numCols, density, rng) @@ -973,8 +955,8 @@ object Matrices { * @param vector a `Vector` that will form the values on the diagonal of the matrix * @return Square `Matrix` with size `values.length` x `values.length` and `values` * on the diagonal - * @since 1.2.0 */ + @Since("1.2.0") def diag(vector: 
Vector): Matrix = DenseMatrix.diag(vector) /** @@ -983,8 +965,8 @@ object Matrices { * a sparse matrix. If the Array is empty, an empty `DenseMatrix` will be returned. * @param matrices array of matrices * @return a single `Matrix` composed of the matrices that were horizontally concatenated - * @since 1.3.0 */ + @Since("1.3.0") def horzcat(matrices: Array[Matrix]): Matrix = { if (matrices.isEmpty) { return new DenseMatrix(0, 0, Array[Double]()) @@ -1042,8 +1024,8 @@ object Matrices { * a sparse matrix. If the Array is empty, an empty `DenseMatrix` will be returned. * @param matrices array of matrices * @return a single `Matrix` composed of the matrices that were vertically concatenated - * @since 1.3.0 */ + @Since("1.3.0") def vertcat(matrices: Array[Matrix]): Matrix = { if (matrices.isEmpty) { return new DenseMatrix(0, 0, Array[Double]()) diff --git a/mllib/src/main/scala/org/apache/spark/mllib/linalg/SingularValueDecomposition.scala b/mllib/src/main/scala/org/apache/spark/mllib/linalg/SingularValueDecomposition.scala index 8f504f6984cb0..a37aca99d5e72 100644 --- a/mllib/src/main/scala/org/apache/spark/mllib/linalg/SingularValueDecomposition.scala +++ b/mllib/src/main/scala/org/apache/spark/mllib/linalg/SingularValueDecomposition.scala @@ -17,13 +17,13 @@ package org.apache.spark.mllib.linalg -import org.apache.spark.annotation.Experimental +import org.apache.spark.annotation.{Experimental, Since} /** * :: Experimental :: * Represents singular value decomposition (SVD) factors. - * @since 1.0.0 */ +@Since("1.0.0") @Experimental case class SingularValueDecomposition[UType, VType](U: UType, s: Vector, V: VType) diff --git a/mllib/src/main/scala/org/apache/spark/mllib/linalg/Vectors.scala b/mllib/src/main/scala/org/apache/spark/mllib/linalg/Vectors.scala index 52ef7be3b38be..3d577edbe23e1 100644 --- a/mllib/src/main/scala/org/apache/spark/mllib/linalg/Vectors.scala +++ b/mllib/src/main/scala/org/apache/spark/mllib/linalg/Vectors.scala @@ -26,7 +26,7 @@ import scala.collection.JavaConverters._ import breeze.linalg.{DenseVector => BDV, SparseVector => BSV, Vector => BV} import org.apache.spark.SparkException -import org.apache.spark.annotation.AlphaComponent +import org.apache.spark.annotation.{AlphaComponent, Since} import org.apache.spark.mllib.util.NumericParser import org.apache.spark.sql.catalyst.InternalRow import org.apache.spark.sql.catalyst.expressions.GenericMutableRow @@ -240,14 +240,14 @@ class VectorUDT extends UserDefinedType[Vector] { * Factory methods for [[org.apache.spark.mllib.linalg.Vector]]. * We don't use the name `Vector` because Scala imports * [[scala.collection.immutable.Vector]] by default. - * @since 1.0.0 */ +@Since("1.0.0") object Vectors { /** * Creates a dense vector from its values. - * @since 1.0.0 */ + @Since("1.0.0") @varargs def dense(firstValue: Double, otherValues: Double*): Vector = new DenseVector((firstValue +: otherValues).toArray) @@ -255,8 +255,8 @@ object Vectors { // A dummy implicit is used to avoid signature collision with the one generated by @varargs. /** * Creates a dense vector from a double array. - * @since 1.0.0 */ + @Since("1.0.0") def dense(values: Array[Double]): Vector = new DenseVector(values) /** @@ -265,8 +265,8 @@ object Vectors { * @param size vector size. * @param indices index array, must be strictly increasing. * @param values value array, must have the same length as indices. 
- * @since 1.0.0 */ + @Since("1.0.0") def sparse(size: Int, indices: Array[Int], values: Array[Double]): Vector = new SparseVector(size, indices, values) @@ -275,8 +275,8 @@ object Vectors { * * @param size vector size. * @param elements vector elements in (index, value) pairs. - * @since 1.0.0 */ + @Since("1.0.0") def sparse(size: Int, elements: Seq[(Int, Double)]): Vector = { require(size > 0, "The size of the requested sparse vector must be greater than 0.") @@ -297,8 +297,8 @@ object Vectors { * * @param size vector size. * @param elements vector elements in (index, value) pairs. - * @since 1.0.0 */ + @Since("1.0.0") def sparse(size: Int, elements: JavaIterable[(JavaInteger, JavaDouble)]): Vector = { sparse(size, elements.asScala.map { case (i, x) => (i.intValue(), x.doubleValue()) @@ -310,16 +310,16 @@ object Vectors { * * @param size vector size * @return a zero vector - * @since 1.1.0 */ + @Since("1.1.0") def zeros(size: Int): Vector = { new DenseVector(new Array[Double](size)) } /** * Parses a string resulted from [[Vector.toString]] into a [[Vector]]. - * @since 1.1.0 */ + @Since("1.1.0") def parse(s: String): Vector = { parseNumeric(NumericParser.parse(s)) } @@ -362,8 +362,8 @@ object Vectors { * @param vector input vector. * @param p norm. * @return norm in L^p^ space. - * @since 1.3.0 */ + @Since("1.3.0") def norm(vector: Vector, p: Double): Double = { require(p >= 1.0, "To compute the p-norm of the vector, we require that you specify a p>=1. " + s"You specified p=$p.") @@ -415,8 +415,8 @@ object Vectors { * @param v1 first Vector. * @param v2 second Vector. * @return squared distance between two Vectors. - * @since 1.3.0 */ + @Since("1.3.0") def sqdist(v1: Vector, v2: Vector): Double = { require(v1.size == v2.size, s"Vector dimensions do not match: Dim(v1)=${v1.size} and Dim(v2)" + s"=${v2.size}.") @@ -529,33 +529,25 @@ object Vectors { /** * A dense vector represented by a value array. - * @since 1.0.0 */ +@Since("1.0.0") @SQLUserDefinedType(udt = classOf[VectorUDT]) class DenseVector(val values: Array[Double]) extends Vector { - /** - * @since 1.0.0 - */ + @Since("1.0.0") override def size: Int = values.length override def toString: String = values.mkString("[", ",", "]") - /** - * @since 1.0.0 - */ + @Since("1.0.0") override def toArray: Array[Double] = values private[spark] override def toBreeze: BV[Double] = new BDV[Double](values) - /** - * @since 1.0.0 - */ + @Since("1.0.0") override def apply(i: Int): Double = values(i) - /** - * @since 1.1.0 - */ + @Since("1.1.0") override def copy: DenseVector = { new DenseVector(values.clone()) } @@ -587,14 +579,10 @@ class DenseVector(val values: Array[Double]) extends Vector { result } - /** - * @since 1.4.0 - */ + @Since("1.4.0") override def numActives: Int = size - /** - * @since 1.4.0 - */ + @Since("1.4.0") override def numNonzeros: Int = { // same as values.count(_ != 0.0) but faster var nnz = 0 @@ -606,9 +594,7 @@ class DenseVector(val values: Array[Double]) extends Vector { nnz } - /** - * @since 1.4.0 - */ + @Since("1.4.0") override def toSparse: SparseVector = { val nnz = numNonzeros val ii = new Array[Int](nnz) @@ -624,9 +610,7 @@ class DenseVector(val values: Array[Double]) extends Vector { new SparseVector(size, ii, vv) } - /** - * @since 1.5.0 - */ + @Since("1.5.0") override def argmax: Int = { if (size == 0) { -1 @@ -646,9 +630,7 @@ class DenseVector(val values: Array[Double]) extends Vector { } } -/** - * @since 1.3.0 - */ +@Since("1.3.0") object DenseVector { /** Extracts the value array from a dense vector. 
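// --- Illustrative usage sketch (not part of the patch) ---
// Shows the dense/sparse factories plus norm, sqdist, and parse from the
// Vectors hunk above; purely local code.
import org.apache.spark.mllib.linalg.Vectors

val dv = Vectors.dense(1.0, 0.0, 3.0)
val sv = Vectors.sparse(3, Array(0, 2), Array(1.0, 3.0))  // same vector, sparse
println(Vectors.norm(dv, 2.0))       // L2 norm
println(Vectors.sqdist(dv, sv))      // 0.0, identical contents
println(Vectors.parse("[1.0,0.0,3.0]"))
// --- end sketch ---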
*/ def unapply(dv: DenseVector): Option[Array[Double]] = Some(dv.values) @@ -660,8 +642,8 @@ object DenseVector { * @param size size of the vector. * @param indices index array, assume to be strictly increasing. * @param values value array, must have the same length as the index array. - * @since 1.0.0 */ +@Since("1.0.0") @SQLUserDefinedType(udt = classOf[VectorUDT]) class SparseVector( override val size: Int, @@ -677,9 +659,7 @@ class SparseVector( override def toString: String = s"($size,${indices.mkString("[", ",", "]")},${values.mkString("[", ",", "]")})" - /** - * @since 1.0.0 - */ + @Since("1.0.0") override def toArray: Array[Double] = { val data = new Array[Double](size) var i = 0 @@ -691,9 +671,7 @@ class SparseVector( data } - /** - * @since 1.1.0 - */ + @Since("1.1.0") override def copy: SparseVector = { new SparseVector(size, indices.clone(), values.clone()) } @@ -734,14 +712,10 @@ class SparseVector( result } - /** - * @since 1.4.0 - */ + @Since("1.4.0") override def numActives: Int = values.length - /** - * @since 1.4.0 - */ + @Since("1.4.0") override def numNonzeros: Int = { var nnz = 0 values.foreach { v => @@ -752,9 +726,7 @@ class SparseVector( nnz } - /** - * @since 1.4.0 - */ + @Since("1.4.0") override def toSparse: SparseVector = { val nnz = numNonzeros if (nnz == numActives) { @@ -774,9 +746,7 @@ class SparseVector( } } - /** - * @since 1.5.0 - */ + @Since("1.5.0") override def argmax: Int = { if (size == 0) { -1 @@ -847,9 +817,7 @@ class SparseVector( } } -/** - * @since 1.3.0 - */ +@Since("1.3.0") object SparseVector { def unapply(sv: SparseVector): Option[(Int, Array[Int], Array[Double])] = Some((sv.size, sv.indices, sv.values)) diff --git a/mllib/src/main/scala/org/apache/spark/mllib/linalg/distributed/BlockMatrix.scala b/mllib/src/main/scala/org/apache/spark/mllib/linalg/distributed/BlockMatrix.scala index cfb6680a18b34..94376c24a7ac6 100644 --- a/mllib/src/main/scala/org/apache/spark/mllib/linalg/distributed/BlockMatrix.scala +++ b/mllib/src/main/scala/org/apache/spark/mllib/linalg/distributed/BlockMatrix.scala @@ -22,7 +22,7 @@ import scala.collection.mutable.ArrayBuffer import breeze.linalg.{DenseMatrix => BDM} import org.apache.spark.{Logging, Partitioner, SparkException} -import org.apache.spark.annotation.Experimental +import org.apache.spark.annotation.{Experimental, Since} import org.apache.spark.mllib.linalg.{DenseMatrix, Matrices, Matrix, SparseMatrix} import org.apache.spark.rdd.RDD import org.apache.spark.storage.StorageLevel @@ -128,9 +128,8 @@ private[mllib] object GridPartitioner { * the number of rows will be calculated when `numRows` is invoked. * @param nCols Number of columns of this matrix. If the supplied value is less than or equal to * zero, the number of columns will be calculated when `numCols` is invoked. - * @since 1.3.0 - * */ +@Since("1.3.0") @Experimental class BlockMatrix( val blocks: RDD[((Int, Int), Matrix)], @@ -151,10 +150,8 @@ class BlockMatrix( * rows are not required to have the given number of rows * @param colsPerBlock Number of columns that make up each block. 
The blocks forming the final * columns are not required to have the given number of columns - * - * @since 1.3.0 - * */ + @Since("1.3.0") def this( blocks: RDD[((Int, Int), Matrix)], rowsPerBlock: Int, @@ -162,20 +159,13 @@ class BlockMatrix( this(blocks, rowsPerBlock, colsPerBlock, 0L, 0L) } - /** - * @since 1.3.0 - * */ - + @Since("1.3.0") override def numRows(): Long = { if (nRows <= 0L) estimateDim() nRows } - /** - * - * @since 1.3.0 - */ - + @Since("1.3.0") override def numCols(): Long = { if (nCols <= 0L) estimateDim() nCols @@ -206,8 +196,8 @@ class BlockMatrix( /** * Validates the block matrix info against the matrix data (`blocks`) and throws an exception if * any error is found. - * @since 1.3.0 */ + @Since("1.3.0") def validate(): Unit = { logDebug("Validating BlockMatrix...") // check if the matrix is larger than the claimed dimensions @@ -243,25 +233,22 @@ class BlockMatrix( logDebug("BlockMatrix is valid!") } - /** Caches the underlying RDD. - * @since 1.3.0 - * */ + /** Caches the underlying RDD. */ + @Since("1.3.0") def cache(): this.type = { blocks.cache() this } - /** Persists the underlying RDD with the specified storage level. - * @since 1.3.0 - * */ + /** Persists the underlying RDD with the specified storage level. */ + @Since("1.3.0") def persist(storageLevel: StorageLevel): this.type = { blocks.persist(storageLevel) this } - /** Converts to CoordinateMatrix. - * @since 1.3.0 - * */ + /** Converts to CoordinateMatrix. */ + @Since("1.3.0") def toCoordinateMatrix(): CoordinateMatrix = { val entryRDD = blocks.flatMap { case ((blockRowIndex, blockColIndex), mat) => val rowStart = blockRowIndex.toLong * rowsPerBlock @@ -275,9 +262,8 @@ class BlockMatrix( new CoordinateMatrix(entryRDD, numRows(), numCols()) } - /** Converts to IndexedRowMatrix. The number of columns must be within the integer range. - * @since 1.3.0 - * */ + /** Converts to IndexedRowMatrix. The number of columns must be within the integer range. */ + @Since("1.3.0") def toIndexedRowMatrix(): IndexedRowMatrix = { require(numCols() < Int.MaxValue, "The number of columns must be within the integer range. " + s"numCols: ${numCols()}") @@ -285,9 +271,8 @@ class BlockMatrix( toCoordinateMatrix().toIndexedRowMatrix() } - /** Collect the distributed matrix on the driver as a `DenseMatrix`. - * @since 1.3.0 - * */ + /** Collect the distributed matrix on the driver as a `DenseMatrix`. */ + @Since("1.3.0") def toLocalMatrix(): Matrix = { require(numRows() < Int.MaxValue, "The number of rows of this matrix should be less than " + s"Int.MaxValue. Currently numRows: ${numRows()}") @@ -312,11 +297,11 @@ class BlockMatrix( new DenseMatrix(m, n, values) } - /** Transpose this `BlockMatrix`. Returns a new `BlockMatrix` instance sharing the - * same underlying data. Is a lazy operation. - * @since 1.3.0 - * - * */ + /** + * Transpose this `BlockMatrix`. Returns a new `BlockMatrix` instance sharing the + * same underlying data. Is a lazy operation. + */ + @Since("1.3.0") def transpose: BlockMatrix = { val transposedBlocks = blocks.map { case ((blockRowIndex, blockColIndex), mat) => ((blockColIndex, blockRowIndex), mat.transpose) @@ -330,13 +315,14 @@ class BlockMatrix( new BDM[Double](localMat.numRows, localMat.numCols, localMat.toArray) } - /** Adds two block matrices together. The matrices must have the same size and matching - * `rowsPerBlock` and `colsPerBlock` values. 
If one of the blocks that are being added are - * instances of [[SparseMatrix]], the resulting sub matrix will also be a [[SparseMatrix]], even - * if it is being added to a [[DenseMatrix]]. If two dense matrices are added, the output will - * also be a [[DenseMatrix]]. - * @since 1.3.0 - */ + /** + * Adds two block matrices together. The matrices must have the same size and matching + * `rowsPerBlock` and `colsPerBlock` values. If one of the blocks that are being added are + * instances of [[SparseMatrix]], the resulting sub matrix will also be a [[SparseMatrix]], even + * if it is being added to a [[DenseMatrix]]. If two dense matrices are added, the output will + * also be a [[DenseMatrix]]. + */ + @Since("1.3.0") def add(other: BlockMatrix): BlockMatrix = { require(numRows() == other.numRows(), "Both matrices must have the same number of rows. " + s"A.numRows: ${numRows()}, B.numRows: ${other.numRows()}") @@ -364,14 +350,14 @@ class BlockMatrix( } } - /** Left multiplies this [[BlockMatrix]] to `other`, another [[BlockMatrix]]. The `colsPerBlock` - * of this matrix must equal the `rowsPerBlock` of `other`. If `other` contains - * [[SparseMatrix]], they will have to be converted to a [[DenseMatrix]]. The output - * [[BlockMatrix]] will only consist of blocks of [[DenseMatrix]]. This may cause - * some performance issues until support for multiplying two sparse matrices is added. - * - * @since 1.3.0 - */ + /** + * Left multiplies this [[BlockMatrix]] to `other`, another [[BlockMatrix]]. The `colsPerBlock` + * of this matrix must equal the `rowsPerBlock` of `other`. If `other` contains + * [[SparseMatrix]], they will have to be converted to a [[DenseMatrix]]. The output + * [[BlockMatrix]] will only consist of blocks of [[DenseMatrix]]. This may cause + * some performance issues until support for multiplying two sparse matrices is added. + */ + @Since("1.3.0") def multiply(other: BlockMatrix): BlockMatrix = { require(numCols() == other.numRows(), "The number of columns of A and the number of rows " + s"of B must be equal. A.numCols: ${numCols()}, B.numRows: ${other.numRows()}. If you " + diff --git a/mllib/src/main/scala/org/apache/spark/mllib/linalg/distributed/CoordinateMatrix.scala b/mllib/src/main/scala/org/apache/spark/mllib/linalg/distributed/CoordinateMatrix.scala index 2b751e45dd76c..4bb27ec840902 100644 --- a/mllib/src/main/scala/org/apache/spark/mllib/linalg/distributed/CoordinateMatrix.scala +++ b/mllib/src/main/scala/org/apache/spark/mllib/linalg/distributed/CoordinateMatrix.scala @@ -19,7 +19,7 @@ package org.apache.spark.mllib.linalg.distributed import breeze.linalg.{DenseMatrix => BDM} -import org.apache.spark.annotation.Experimental +import org.apache.spark.annotation.{Experimental, Since} import org.apache.spark.rdd.RDD import org.apache.spark.mllib.linalg.{Matrix, SparseMatrix, Vectors} @@ -29,8 +29,8 @@ import org.apache.spark.mllib.linalg.{Matrix, SparseMatrix, Vectors} * @param i row index * @param j column index * @param value value of the entry - * @since 1.0.0 */ +@Since("1.0.0") @Experimental case class MatrixEntry(i: Long, j: Long, value: Double) @@ -43,22 +43,20 @@ case class MatrixEntry(i: Long, j: Long, value: Double) * be determined by the max row index plus one. * @param nCols number of columns. A non-positive value means unknown, and then the number of * columns will be determined by the max column index plus one. 
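// --- Illustrative usage sketch (not part of the patch) ---
// Exercises the BlockMatrix operations annotated above (validate, add,
// multiply, transpose, toLocalMatrix). Assumes an existing SparkContext `sc`;
// the entries are placeholders. The BlockMatrix is built via a CoordinateMatrix.
import org.apache.spark.mllib.linalg.distributed.{CoordinateMatrix, MatrixEntry}

val blockEntries = sc.parallelize(Seq(
  MatrixEntry(0, 0, 1.0), MatrixEntry(1, 1, 2.0), MatrixEntry(2, 0, 3.0)))
val blockMat = new CoordinateMatrix(blockEntries).toBlockMatrix(2, 2).cache()
blockMat.validate()

val sum     = blockMat.add(blockMat)                  // same size and block sizes required
val product = blockMat.multiply(blockMat.transpose)   // colsPerBlock must equal other's rowsPerBlock
println(product.toLocalMatrix())
// --- end sketch ---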
- * @since 1.0.0 */ +@Since("1.0.0") @Experimental class CoordinateMatrix( val entries: RDD[MatrixEntry], private var nRows: Long, private var nCols: Long) extends DistributedMatrix { - /** Alternative constructor leaving matrix dimensions to be determined automatically. - * @since 1.0.0 - * */ + /** Alternative constructor leaving matrix dimensions to be determined automatically. */ + @Since("1.0.0") def this(entries: RDD[MatrixEntry]) = this(entries, 0L, 0L) - /** Gets or computes the number of columns. - * @since 1.0.0 - * */ + /** Gets or computes the number of columns. */ + @Since("1.0.0") override def numCols(): Long = { if (nCols <= 0L) { computeSize() @@ -66,9 +64,8 @@ class CoordinateMatrix( nCols } - /** Gets or computes the number of rows. - * @since 1.0.0 - * */ + /** Gets or computes the number of rows. */ + @Since("1.0.0") override def numRows(): Long = { if (nRows <= 0L) { computeSize() @@ -76,16 +73,14 @@ class CoordinateMatrix( nRows } - /** Transposes this CoordinateMatrix. - * @since 1.3.0 - * */ + /** Transposes this CoordinateMatrix. */ + @Since("1.3.0") def transpose(): CoordinateMatrix = { new CoordinateMatrix(entries.map(x => MatrixEntry(x.j, x.i, x.value)), numCols(), numRows()) } - /** Converts to IndexedRowMatrix. The number of columns must be within the integer range. - * @since 1.0.0 - * */ + /** Converts to IndexedRowMatrix. The number of columns must be within the integer range. */ + @Since("1.0.0") def toIndexedRowMatrix(): IndexedRowMatrix = { val nl = numCols() if (nl > Int.MaxValue) { @@ -104,15 +99,14 @@ class CoordinateMatrix( /** * Converts to RowMatrix, dropping row indices after grouping by row index. * The number of columns must be within the integer range. - * @since 1.0.0 */ + @Since("1.0.0") def toRowMatrix(): RowMatrix = { toIndexedRowMatrix().toRowMatrix() } - /** Converts to BlockMatrix. Creates blocks of [[SparseMatrix]] with size 1024 x 1024. - * @since 1.3.0 - * */ + /** Converts to BlockMatrix. Creates blocks of [[SparseMatrix]] with size 1024 x 1024. */ + @Since("1.3.0") def toBlockMatrix(): BlockMatrix = { toBlockMatrix(1024, 1024) } @@ -124,8 +118,8 @@ class CoordinateMatrix( * @param colsPerBlock The number of columns of each block. The blocks at the right edge may have * a smaller value. Must be an integer value greater than 0. * @return a [[BlockMatrix]] - * @since 1.3.0 */ + @Since("1.3.0") def toBlockMatrix(rowsPerBlock: Int, colsPerBlock: Int): BlockMatrix = { require(rowsPerBlock > 0, s"rowsPerBlock needs to be greater than 0. rowsPerBlock: $rowsPerBlock") diff --git a/mllib/src/main/scala/org/apache/spark/mllib/linalg/distributed/DistributedMatrix.scala b/mllib/src/main/scala/org/apache/spark/mllib/linalg/distributed/DistributedMatrix.scala index 98e90af84abac..e51327ebb7b58 100644 --- a/mllib/src/main/scala/org/apache/spark/mllib/linalg/distributed/DistributedMatrix.scala +++ b/mllib/src/main/scala/org/apache/spark/mllib/linalg/distributed/DistributedMatrix.scala @@ -19,10 +19,12 @@ package org.apache.spark.mllib.linalg.distributed import breeze.linalg.{DenseMatrix => BDM} +import org.apache.spark.annotation.Since + /** * Represents a distributively stored matrix backed by one or more RDDs. - * @since 1.0.0 */ +@Since("1.0.0") trait DistributedMatrix extends Serializable { /** Gets or computes the number of rows. 
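// A sketch of building a CoordinateMatrix from MatrixEntry records and using the conversions
// annotated above (illustrative, not part of the patch). Assumes a SparkContext `sc`.
import org.apache.spark.mllib.linalg.distributed.{CoordinateMatrix, MatrixEntry}

val entries = sc.parallelize(Seq(
  MatrixEntry(0L, 0L, 1.0), MatrixEntry(1L, 2L, 3.5), MatrixEntry(2L, 1L, 2.0)))
val coordMat = new CoordinateMatrix(entries)       // dimensions inferred from the max indices
val rowMat   = coordMat.toRowMatrix()              // drops row indices
val asBlocks = coordMat.toBlockMatrix(1024, 1024)  // SparseMatrix blocks of size 1024 x 1024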
*/ diff --git a/mllib/src/main/scala/org/apache/spark/mllib/linalg/distributed/IndexedRowMatrix.scala b/mllib/src/main/scala/org/apache/spark/mllib/linalg/distributed/IndexedRowMatrix.scala index a09f88ce28e58..6d2c05a47d049 100644 --- a/mllib/src/main/scala/org/apache/spark/mllib/linalg/distributed/IndexedRowMatrix.scala +++ b/mllib/src/main/scala/org/apache/spark/mllib/linalg/distributed/IndexedRowMatrix.scala @@ -19,7 +19,7 @@ package org.apache.spark.mllib.linalg.distributed import breeze.linalg.{DenseMatrix => BDM} -import org.apache.spark.annotation.Experimental +import org.apache.spark.annotation.{Experimental, Since} import org.apache.spark.rdd.RDD import org.apache.spark.mllib.linalg._ import org.apache.spark.mllib.linalg.SingularValueDecomposition @@ -27,8 +27,8 @@ import org.apache.spark.mllib.linalg.SingularValueDecomposition /** * :: Experimental :: * Represents a row of [[org.apache.spark.mllib.linalg.distributed.IndexedRowMatrix]]. - * @since 1.0.0 */ +@Since("1.0.0") @Experimental case class IndexedRow(index: Long, vector: Vector) @@ -42,23 +42,19 @@ case class IndexedRow(index: Long, vector: Vector) * be determined by the max row index plus one. * @param nCols number of columns. A non-positive value means unknown, and then the number of * columns will be determined by the size of the first row. - * @since 1.0.0 */ +@Since("1.0.0") @Experimental class IndexedRowMatrix( val rows: RDD[IndexedRow], private var nRows: Long, private var nCols: Int) extends DistributedMatrix { - /** Alternative constructor leaving matrix dimensions to be determined automatically. - * @since 1.0.0 - * */ + /** Alternative constructor leaving matrix dimensions to be determined automatically. */ + @Since("1.0.0") def this(rows: RDD[IndexedRow]) = this(rows, 0L, 0) - /** - * - * @since 1.0.0 - */ + @Since("1.0.0") override def numCols(): Long = { if (nCols <= 0) { // Calling `first` will throw an exception if `rows` is empty. @@ -67,10 +63,7 @@ class IndexedRowMatrix( nCols } - /** - * - * @since 1.0.0 - */ + @Since("1.0.0") override def numRows(): Long = { if (nRows <= 0L) { // Reduce will throw an exception if `rows` is empty. @@ -82,15 +75,14 @@ class IndexedRowMatrix( /** * Drops row indices and converts this matrix to a * [[org.apache.spark.mllib.linalg.distributed.RowMatrix]]. - * @since 1.0.0 */ + @Since("1.0.0") def toRowMatrix(): RowMatrix = { new RowMatrix(rows.map(_.vector), 0L, nCols) } - /** Converts to BlockMatrix. Creates blocks of [[SparseMatrix]] with size 1024 x 1024. - * @since 1.3.0 - * */ + /** Converts to BlockMatrix. Creates blocks of [[SparseMatrix]] with size 1024 x 1024. */ + @Since("1.3.0") def toBlockMatrix(): BlockMatrix = { toBlockMatrix(1024, 1024) } @@ -102,8 +94,8 @@ class IndexedRowMatrix( * @param colsPerBlock The number of columns of each block. The blocks at the right edge may have * a smaller value. Must be an integer value greater than 0. * @return a [[BlockMatrix]] - * @since 1.3.0 */ + @Since("1.3.0") def toBlockMatrix(rowsPerBlock: Int, colsPerBlock: Int): BlockMatrix = { // TODO: This implementation may be optimized toCoordinateMatrix().toBlockMatrix(rowsPerBlock, colsPerBlock) @@ -112,8 +104,8 @@ class IndexedRowMatrix( /** * Converts this matrix to a * [[org.apache.spark.mllib.linalg.distributed.CoordinateMatrix]]. - * @since 1.3.0 */ + @Since("1.3.0") def toCoordinateMatrix(): CoordinateMatrix = { val entries = rows.flatMap { row => val rowIndex = row.index @@ -149,8 +141,8 @@ class IndexedRowMatrix( * @param rCond the reciprocal condition number. 
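// A sketch of an IndexedRowMatrix built from IndexedRow records, plus the conversions annotated
// above (illustrative, not part of the patch). Assumes a SparkContext `sc`.
import org.apache.spark.mllib.linalg.Vectors
import org.apache.spark.mllib.linalg.distributed.{IndexedRow, IndexedRowMatrix}

val indexedRows = sc.parallelize(Seq(
  IndexedRow(0L, Vectors.dense(1.0, 2.0, 3.0)),
  IndexedRow(3L, Vectors.dense(4.0, 5.0, 6.0))))
val idxMat = new IndexedRowMatrix(indexedRows)   // nRows = max index + 1, nCols from the first row
val coords = idxMat.toCoordinateMatrix()
val noIdx  = idxMat.toRowMatrix()                // row indices dropped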
All singular values smaller than rCond * sigma(0) * are treated as zero, where sigma(0) is the largest singular value. * @return SingularValueDecomposition(U, s, V) - * @since 1.0.0 */ + @Since("1.0.0") def computeSVD( k: Int, computeU: Boolean = false, @@ -176,8 +168,8 @@ class IndexedRowMatrix( * * @param B a local matrix whose number of rows must match the number of columns of this matrix * @return an IndexedRowMatrix representing the product, which preserves partitioning - * @since 1.0.0 */ + @Since("1.0.0") def multiply(B: Matrix): IndexedRowMatrix = { val mat = toRowMatrix().multiply(B) val indexedRows = rows.map(_.index).zip(mat.rows).map { case (i, v) => @@ -188,8 +180,8 @@ class IndexedRowMatrix( /** * Computes the Gramian matrix `A^T A`. - * @since 1.0.0 */ + @Since("1.0.0") def computeGramianMatrix(): Matrix = { toRowMatrix().computeGramianMatrix() } diff --git a/mllib/src/main/scala/org/apache/spark/mllib/linalg/distributed/RowMatrix.scala b/mllib/src/main/scala/org/apache/spark/mllib/linalg/distributed/RowMatrix.scala index b2e94f2dd6201..78036eba5c3e6 100644 --- a/mllib/src/main/scala/org/apache/spark/mllib/linalg/distributed/RowMatrix.scala +++ b/mllib/src/main/scala/org/apache/spark/mllib/linalg/distributed/RowMatrix.scala @@ -28,7 +28,7 @@ import com.github.fommil.netlib.BLAS.{getInstance => blas} import org.apache.spark.Logging import org.apache.spark.SparkContext._ -import org.apache.spark.annotation.Experimental +import org.apache.spark.annotation.{Experimental, Since} import org.apache.spark.mllib.linalg._ import org.apache.spark.mllib.stat.{MultivariateOnlineSummarizer, MultivariateStatisticalSummary} import org.apache.spark.rdd.RDD @@ -44,22 +44,20 @@ import org.apache.spark.storage.StorageLevel * be determined by the number of records in the RDD `rows`. * @param nCols number of columns. A non-positive value means unknown, and then the number of * columns will be determined by the size of the first row. - * @since 1.0.0 */ +@Since("1.0.0") @Experimental class RowMatrix( val rows: RDD[Vector], private var nRows: Long, private var nCols: Int) extends DistributedMatrix with Logging { - /** Alternative constructor leaving matrix dimensions to be determined automatically. - * @since 1.0.0 - * */ + /** Alternative constructor leaving matrix dimensions to be determined automatically. */ + @Since("1.0.0") def this(rows: RDD[Vector]) = this(rows, 0L, 0) - /** Gets or computes the number of columns. - * @since 1.0.0 - * */ + /** Gets or computes the number of columns. */ + @Since("1.0.0") override def numCols(): Long = { if (nCols <= 0) { try { @@ -74,9 +72,8 @@ class RowMatrix( nCols } - /** Gets or computes the number of rows. - * @since 1.0.0 - * */ + /** Gets or computes the number of rows. */ + @Since("1.0.0") override def numRows(): Long = { if (nRows <= 0L) { nRows = rows.count() @@ -114,8 +111,8 @@ class RowMatrix( /** * Computes the Gramian matrix `A^T A`. - * @since 1.0.0 */ + @Since("1.0.0") def computeGramianMatrix(): Matrix = { val n = numCols().toInt checkNumColumns(n) @@ -185,8 +182,8 @@ class RowMatrix( * @param rCond the reciprocal condition number. All singular values smaller than rCond * sigma(0) * are treated as zero, where sigma(0) is the largest singular value. * @return SingularValueDecomposition(U, s, V). U = null if computeU = false. - * @since 1.0.0 */ + @Since("1.0.0") def computeSVD( k: Int, computeU: Boolean = false, @@ -326,8 +323,8 @@ class RowMatrix( /** * Computes the covariance matrix, treating each row as an observation. 
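// A sketch of a truncated SVD via the computeSVD method annotated above (illustrative, not part of
// the patch). `rowMat` is assumed to be a RowMatrix (or IndexedRowMatrix), for example the one
// built in the CoordinateMatrix sketch earlier.
val svd = rowMat.computeSVD(2, computeU = true)
val U = svd.U   // distributed left singular vectors (null when computeU = false)
val s = svd.s   // singular values as a local Vector, in descending order
val V = svd.V   // local n x k matrix of right singular vectors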
* @return a local dense matrix of size n x n - * @since 1.0.0 */ + @Since("1.0.0") def computeCovariance(): Matrix = { val n = numCols().toInt checkNumColumns(n) @@ -380,8 +377,8 @@ class RowMatrix( * * @param k number of top principal components. * @return a matrix of size n-by-k, whose columns are principal components - * @since 1.0.0 */ + @Since("1.0.0") def computePrincipalComponents(k: Int): Matrix = { val n = numCols().toInt require(k > 0 && k <= n, s"k = $k out of range (0, n = $n]") @@ -399,8 +396,8 @@ class RowMatrix( /** * Computes column-wise summary statistics. - * @since 1.0.0 */ + @Since("1.0.0") def computeColumnSummaryStatistics(): MultivariateStatisticalSummary = { val summary = rows.treeAggregate(new MultivariateOnlineSummarizer)( (aggregator, data) => aggregator.add(data), @@ -415,8 +412,8 @@ class RowMatrix( * @param B a local matrix whose number of rows must match the number of columns of this matrix * @return a [[org.apache.spark.mllib.linalg.distributed.RowMatrix]] representing the product, * which preserves partitioning - * @since 1.0.0 */ + @Since("1.0.0") def multiply(B: Matrix): RowMatrix = { val n = numCols().toInt val k = B.numCols @@ -448,8 +445,8 @@ class RowMatrix( * * @return An n x n sparse upper-triangular matrix of cosine similarities between * columns of this matrix. - * @since 1.2.0 */ + @Since("1.2.0") def columnSimilarities(): CoordinateMatrix = { columnSimilarities(0.0) } @@ -492,8 +489,8 @@ class RowMatrix( * with the cost vs estimate quality trade-off described above. * @return An n x n sparse upper-triangular matrix of cosine similarities * between columns of this matrix. - * @since 1.2.0 */ + @Since("1.2.0") def columnSimilarities(threshold: Double): CoordinateMatrix = { require(threshold >= 0, s"Threshold cannot be negative: $threshold") @@ -671,9 +668,7 @@ class RowMatrix( } } -/** - * @since 1.0.0 - */ +@Since("1.0.0") @Experimental object RowMatrix { diff --git a/mllib/src/main/scala/org/apache/spark/mllib/recommendation/ALS.scala b/mllib/src/main/scala/org/apache/spark/mllib/recommendation/ALS.scala index 56c549ef99cb7..b27ef1b949e2e 100644 --- a/mllib/src/main/scala/org/apache/spark/mllib/recommendation/ALS.scala +++ b/mllib/src/main/scala/org/apache/spark/mllib/recommendation/ALS.scala @@ -18,7 +18,7 @@ package org.apache.spark.mllib.recommendation import org.apache.spark.Logging -import org.apache.spark.annotation.DeveloperApi +import org.apache.spark.annotation.{DeveloperApi, Since} import org.apache.spark.api.java.JavaRDD import org.apache.spark.ml.recommendation.{ALS => NewALS} import org.apache.spark.rdd.RDD @@ -26,8 +26,8 @@ import org.apache.spark.storage.StorageLevel /** * A more compact class to represent a rating than Tuple3[Int, Int, Double]. - * @since 0.8.0 */ +@Since("0.8.0") case class Rating(user: Int, product: Int, rating: Double) /** @@ -255,8 +255,8 @@ class ALS private ( /** * Top-level methods for calling Alternating Least Squares (ALS) matrix factorization. 
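// A sketch of the RowMatrix statistics methods annotated above (illustrative, not part of the
// patch), again assuming `rowMat` is a RowMatrix with a handful of numeric columns.
val cov     = rowMat.computeCovariance()             // local n x n dense matrix
val pc      = rowMat.computePrincipalComponents(2)   // n x 2 matrix of principal components
val colStat = rowMat.computeColumnSummaryStatistics()
println(s"means=${colStat.mean} variances=${colStat.variance}")
val cosSims = rowMat.columnSimilarities(0.1)         // approximate cosine similarities (DIMSUM)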
- * @since 0.8.0 */ +@Since("0.8.0") object ALS { /** * Train a matrix factorization model given an RDD of ratings given by users to some products, @@ -271,8 +271,8 @@ object ALS { * @param lambda regularization factor (recommended: 0.01) * @param blocks level of parallelism to split computation into * @param seed random seed - * @since 0.9.1 */ + @Since("0.9.1") def train( ratings: RDD[Rating], rank: Int, @@ -296,8 +296,8 @@ object ALS { * @param iterations number of iterations of ALS (recommended: 10-20) * @param lambda regularization factor (recommended: 0.01) * @param blocks level of parallelism to split computation into - * @since 0.8.0 */ + @Since("0.8.0") def train( ratings: RDD[Rating], rank: Int, @@ -319,8 +319,8 @@ object ALS { * @param rank number of features to use * @param iterations number of iterations of ALS (recommended: 10-20) * @param lambda regularization factor (recommended: 0.01) - * @since 0.8.0 */ + @Since("0.8.0") def train(ratings: RDD[Rating], rank: Int, iterations: Int, lambda: Double) : MatrixFactorizationModel = { train(ratings, rank, iterations, lambda, -1) @@ -336,8 +336,8 @@ object ALS { * @param ratings RDD of (userID, productID, rating) pairs * @param rank number of features to use * @param iterations number of iterations of ALS (recommended: 10-20) - * @since 0.8.0 */ + @Since("0.8.0") def train(ratings: RDD[Rating], rank: Int, iterations: Int) : MatrixFactorizationModel = { train(ratings, rank, iterations, 0.01, -1) @@ -357,8 +357,8 @@ object ALS { * @param blocks level of parallelism to split computation into * @param alpha confidence parameter * @param seed random seed - * @since 0.8.1 */ + @Since("0.8.1") def trainImplicit( ratings: RDD[Rating], rank: Int, @@ -384,8 +384,8 @@ object ALS { * @param lambda regularization factor (recommended: 0.01) * @param blocks level of parallelism to split computation into * @param alpha confidence parameter - * @since 0.8.1 */ + @Since("0.8.1") def trainImplicit( ratings: RDD[Rating], rank: Int, @@ -409,8 +409,8 @@ object ALS { * @param iterations number of iterations of ALS (recommended: 10-20) * @param lambda regularization factor (recommended: 0.01) * @param alpha confidence parameter - * @since 0.8.1 */ + @Since("0.8.1") def trainImplicit(ratings: RDD[Rating], rank: Int, iterations: Int, lambda: Double, alpha: Double) : MatrixFactorizationModel = { trainImplicit(ratings, rank, iterations, lambda, -1, alpha) @@ -427,8 +427,8 @@ object ALS { * @param ratings RDD of (userID, productID, rating) pairs * @param rank number of features to use * @param iterations number of iterations of ALS (recommended: 10-20) - * @since 0.8.1 */ + @Since("0.8.1") def trainImplicit(ratings: RDD[Rating], rank: Int, iterations: Int) : MatrixFactorizationModel = { trainImplicit(ratings, rank, iterations, 0.01, -1, 1.0) diff --git a/mllib/src/main/scala/org/apache/spark/mllib/recommendation/MatrixFactorizationModel.scala b/mllib/src/main/scala/org/apache/spark/mllib/recommendation/MatrixFactorizationModel.scala index 261ca9cef0c5b..ba4cfdcd9f1dd 100644 --- a/mllib/src/main/scala/org/apache/spark/mllib/recommendation/MatrixFactorizationModel.scala +++ b/mllib/src/main/scala/org/apache/spark/mllib/recommendation/MatrixFactorizationModel.scala @@ -30,6 +30,7 @@ import org.json4s.JsonDSL._ import org.json4s.jackson.JsonMethods._ import org.apache.spark.{Logging, SparkContext} +import org.apache.spark.annotation.Since import org.apache.spark.api.java.{JavaPairRDD, JavaRDD} import org.apache.spark.mllib.linalg._ import 
org.apache.spark.mllib.rdd.MLPairRDDFunctions._ @@ -49,8 +50,8 @@ import org.apache.spark.storage.StorageLevel * the features computed for this user. * @param productFeatures RDD of tuples where each tuple represents the productId * and the features computed for this product. - * @since 0.8.0 */ +@Since("0.8.0") class MatrixFactorizationModel( val rank: Int, val userFeatures: RDD[(Int, Array[Double])], @@ -74,9 +75,8 @@ class MatrixFactorizationModel( } } - /** Predict the rating of one user for one product. - * @since 0.8.0 - */ + /** Predict the rating of one user for one product. */ + @Since("0.8.0") def predict(user: Int, product: Int): Double = { val userVector = userFeatures.lookup(user).head val productVector = productFeatures.lookup(product).head @@ -114,8 +114,8 @@ class MatrixFactorizationModel( * * @param usersProducts RDD of (user, product) pairs. * @return RDD of Ratings. - * @since 0.9.0 */ + @Since("0.9.0") def predict(usersProducts: RDD[(Int, Int)]): RDD[Rating] = { // Previously the partitions of ratings are only based on the given products. // So if the usersProducts given for prediction contains only few products or @@ -146,8 +146,8 @@ class MatrixFactorizationModel( /** * Java-friendly version of [[MatrixFactorizationModel.predict]]. - * @since 1.2.0 */ + @Since("1.2.0") def predict(usersProducts: JavaPairRDD[JavaInteger, JavaInteger]): JavaRDD[Rating] = { predict(usersProducts.rdd.asInstanceOf[RDD[(Int, Int)]]).toJavaRDD() } @@ -162,8 +162,8 @@ class MatrixFactorizationModel( * by score, decreasing. The first returned is the one predicted to be most strongly * recommended to the user. The score is an opaque value that indicates how strongly * recommended the product is. - * @since 1.1.0 */ + @Since("1.1.0") def recommendProducts(user: Int, num: Int): Array[Rating] = MatrixFactorizationModel.recommend(userFeatures.lookup(user).head, productFeatures, num) .map(t => Rating(user, t._1, t._2)) @@ -179,8 +179,8 @@ class MatrixFactorizationModel( * by score, decreasing. The first returned is the one predicted to be most strongly * recommended to the product. The score is an opaque value that indicates how strongly * recommended the user is. - * @since 1.1.0 */ + @Since("1.1.0") def recommendUsers(product: Int, num: Int): Array[Rating] = MatrixFactorizationModel.recommend(productFeatures.lookup(product).head, userFeatures, num) .map(t => Rating(t._1, product, t._2)) @@ -199,8 +199,8 @@ class MatrixFactorizationModel( * @param sc Spark context used to save model data. * @param path Path specifying the directory in which to save this model. * If the directory already exists, this method throws an exception. - * @since 1.3.0 */ + @Since("1.3.0") override def save(sc: SparkContext, path: String): Unit = { MatrixFactorizationModel.SaveLoadV1_0.save(this, path) } @@ -212,8 +212,8 @@ class MatrixFactorizationModel( * @return [(Int, Array[Rating])] objects, where every tuple contains a userID and an array of * rating objects which contains the same userId, recommended productID and a "score" in the * rating field. 
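// A sketch of training ALS and querying the resulting MatrixFactorizationModel with the methods
// annotated above (illustrative, not part of the patch). Assumes a SparkContext `sc`; the ratings
// are made up.
import org.apache.spark.mllib.recommendation.{ALS, Rating}

val ratings = sc.parallelize(Seq(
  Rating(1, 10, 4.0), Rating(1, 20, 1.0), Rating(2, 10, 5.0), Rating(2, 30, 3.0)))
val alsModel = ALS.train(ratings, 5, 10, 0.01)        // rank = 5, iterations = 10, lambda = 0.01
val oneScore = alsModel.predict(1, 30)                // single (user, product) prediction
val topTwo   = alsModel.recommendProducts(1, 2)       // Array[Rating], highest score first
val topAll   = alsModel.recommendProductsForUsers(2)  // RDD[(Int, Array[Rating])]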
Semantics of score is same as recommendProducts API - * @since 1.4.0 */ + @Since("1.4.0") def recommendProductsForUsers(num: Int): RDD[(Int, Array[Rating])] = { MatrixFactorizationModel.recommendForAll(rank, userFeatures, productFeatures, num).map { case (user, top) => @@ -230,8 +230,8 @@ class MatrixFactorizationModel( * @return [(Int, Array[Rating])] objects, where every tuple contains a productID and an array * of rating objects which contains the recommended userId, same productID and a "score" in the * rating field. Semantics of score is same as recommendUsers API - * @since 1.4.0 */ + @Since("1.4.0") def recommendUsersForProducts(num: Int): RDD[(Int, Array[Rating])] = { MatrixFactorizationModel.recommendForAll(rank, productFeatures, userFeatures, num).map { case (product, top) => @@ -241,9 +241,7 @@ class MatrixFactorizationModel( } } -/** - * @since 1.3.0 - */ +@Since("1.3.0") object MatrixFactorizationModel extends Loader[MatrixFactorizationModel] { import org.apache.spark.mllib.util.Loader._ @@ -326,8 +324,8 @@ object MatrixFactorizationModel extends Loader[MatrixFactorizationModel] { * @param sc Spark context used for loading model files. * @param path Path specifying the directory to which the model was saved. * @return Model instance - * @since 1.3.0 */ + @Since("1.3.0") override def load(sc: SparkContext, path: String): MatrixFactorizationModel = { val (loadedClassName, formatVersion, _) = loadMetadata(sc, path) val classNameV1_0 = SaveLoadV1_0.thisClassName diff --git a/mllib/src/main/scala/org/apache/spark/mllib/regression/GeneralizedLinearAlgorithm.scala b/mllib/src/main/scala/org/apache/spark/mllib/regression/GeneralizedLinearAlgorithm.scala index 2980b94de35b0..509f6a2d169c4 100644 --- a/mllib/src/main/scala/org/apache/spark/mllib/regression/GeneralizedLinearAlgorithm.scala +++ b/mllib/src/main/scala/org/apache/spark/mllib/regression/GeneralizedLinearAlgorithm.scala @@ -17,7 +17,7 @@ package org.apache.spark.mllib.regression -import org.apache.spark.annotation.DeveloperApi +import org.apache.spark.annotation.{DeveloperApi, Since} import org.apache.spark.mllib.feature.StandardScaler import org.apache.spark.{Logging, SparkException} import org.apache.spark.rdd.RDD @@ -35,8 +35,8 @@ import org.apache.spark.storage.StorageLevel * @param weights Weights computed for every feature. * @param intercept Intercept computed for this model. * - * @since 0.8.0 */ +@Since("0.8.0") @DeveloperApi abstract class GeneralizedLinearModel(val weights: Vector, val intercept: Double) extends Serializable { @@ -56,8 +56,8 @@ abstract class GeneralizedLinearModel(val weights: Vector, val intercept: Double * @param testData RDD representing data points to be predicted * @return RDD[Double] where each entry contains the corresponding prediction * - * @since 1.0.0 */ + @Since("1.0.0") def predict(testData: RDD[Vector]): RDD[Double] = { // A small optimization to avoid serializing the entire model. Only the weightsMatrix // and intercept is needed. @@ -76,8 +76,8 @@ abstract class GeneralizedLinearModel(val weights: Vector, val intercept: Double * @param testData array representing a single data point * @return Double prediction from the trained model * - * @since 1.0.0 */ + @Since("1.0.0") def predict(testData: Vector): Double = { predictPoint(testData, weights, intercept) } @@ -95,8 +95,8 @@ abstract class GeneralizedLinearModel(val weights: Vector, val intercept: Double * GeneralizedLinearAlgorithm implements methods to train a Generalized Linear Model (GLM). 
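// A sketch of round-tripping the factorization model through the save/load methods annotated above
// (illustrative, not part of the patch). Assumes the `alsModel` and `sc` from the previous sketch;
// the path is a placeholder and must not already exist.
import org.apache.spark.mllib.recommendation.MatrixFactorizationModel

alsModel.save(sc, "target/tmp/alsModel")
val restored = MatrixFactorizationModel.load(sc, "target/tmp/alsModel")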
* This class should be extended with an Optimizer to create a new GLM. * - * @since 0.8.0 */ +@Since("0.8.0") @DeveloperApi abstract class GeneralizedLinearAlgorithm[M <: GeneralizedLinearModel] extends Logging with Serializable { @@ -106,8 +106,8 @@ abstract class GeneralizedLinearAlgorithm[M <: GeneralizedLinearModel] /** * The optimizer to solve the problem. * - * @since 1.0.0 */ + @Since("1.0.0") def optimizer: Optimizer /** Whether to add intercept (default: false). */ @@ -143,8 +143,8 @@ abstract class GeneralizedLinearAlgorithm[M <: GeneralizedLinearModel] /** * The dimension of training features. * - * @since 1.4.0 */ + @Since("1.4.0") def getNumFeatures: Int = this.numFeatures /** @@ -168,16 +168,16 @@ abstract class GeneralizedLinearAlgorithm[M <: GeneralizedLinearModel] /** * Get if the algorithm uses addIntercept * - * @since 1.4.0 */ + @Since("1.4.0") def isAddIntercept: Boolean = this.addIntercept /** * Set if the algorithm should add an intercept. Default false. * We set the default to false because adding the intercept will cause memory allocation. * - * @since 0.8.0 */ + @Since("0.8.0") def setIntercept(addIntercept: Boolean): this.type = { this.addIntercept = addIntercept this @@ -186,8 +186,8 @@ abstract class GeneralizedLinearAlgorithm[M <: GeneralizedLinearModel] /** * Set if the algorithm should validate data before training. Default true. * - * @since 0.8.0 */ + @Since("0.8.0") def setValidateData(validateData: Boolean): this.type = { this.validateData = validateData this @@ -197,8 +197,8 @@ abstract class GeneralizedLinearAlgorithm[M <: GeneralizedLinearModel] * Run the algorithm with the configured parameters on an input * RDD of LabeledPoint entries. * - * @since 0.8.0 */ + @Since("0.8.0") def run(input: RDD[LabeledPoint]): M = { if (numFeatures < 0) { numFeatures = input.map(_.features.size).first() @@ -231,8 +231,8 @@ abstract class GeneralizedLinearAlgorithm[M <: GeneralizedLinearModel] * Run the algorithm with the configured parameters on an input RDD * of LabeledPoint entries starting from the initial weights provided. * - * @since 1.0.0 */ + @Since("1.0.0") def run(input: RDD[LabeledPoint], initialWeights: Vector): M = { if (numFeatures < 0) { diff --git a/mllib/src/main/scala/org/apache/spark/mllib/regression/IsotonicRegression.scala b/mllib/src/main/scala/org/apache/spark/mllib/regression/IsotonicRegression.scala index 8995591d9e8ce..31ca7c2f207d9 100644 --- a/mllib/src/main/scala/org/apache/spark/mllib/regression/IsotonicRegression.scala +++ b/mllib/src/main/scala/org/apache/spark/mllib/regression/IsotonicRegression.scala @@ -29,7 +29,7 @@ import org.json4s.JsonDSL._ import org.json4s.jackson.JsonMethods._ import org.apache.spark.SparkContext -import org.apache.spark.annotation.Experimental +import org.apache.spark.annotation.{Experimental, Since} import org.apache.spark.api.java.{JavaDoubleRDD, JavaRDD} import org.apache.spark.mllib.linalg.{Vector, Vectors} import org.apache.spark.mllib.util.{Loader, Saveable} @@ -47,8 +47,8 @@ import org.apache.spark.sql.SQLContext * Results of isotonic regression and therefore monotone. * @param isotonic indicates whether this is isotonic or antitonic. * - * @since 1.3.0 */ +@Since("1.3.0") @Experimental class IsotonicRegressionModel ( val boundaries: Array[Double], @@ -64,8 +64,8 @@ class IsotonicRegressionModel ( /** * A Java-friendly constructor that takes two Iterable parameters and one Boolean parameter. 
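// A sketch of the GeneralizedLinearAlgorithm knobs annotated above, exercised through one of its
// concrete subclasses (illustrative, not part of the patch). LinearRegressionWithSGD and its
// no-arg constructor come from elsewhere in mllib; assumes `trainingPoints` is an RDD[LabeledPoint].
import org.apache.spark.mllib.regression.LinearRegressionWithSGD

val lr = new LinearRegressionWithSGD()
lr.setIntercept(true)                 // also fit an intercept term
lr.setValidateData(true)              // check the input before training (the default)
lr.optimizer.setNumIterations(200)    // the exposed optimizer drives the actual fitting
val glmModel = lr.run(trainingPoints)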
* - * @since 1.4.0 */ + @Since("1.4.0") def this(boundaries: java.lang.Iterable[Double], predictions: java.lang.Iterable[Double], isotonic: java.lang.Boolean) = { @@ -90,8 +90,8 @@ class IsotonicRegressionModel ( * @param testData Features to be labeled. * @return Predicted labels. * - * @since 1.3.0 */ + @Since("1.3.0") def predict(testData: RDD[Double]): RDD[Double] = { testData.map(predict) } @@ -103,8 +103,8 @@ class IsotonicRegressionModel ( * @param testData Features to be labeled. * @return Predicted labels. * - * @since 1.3.0 */ + @Since("1.3.0") def predict(testData: JavaDoubleRDD): JavaDoubleRDD = { JavaDoubleRDD.fromRDD(predict(testData.rdd.retag.asInstanceOf[RDD[Double]])) } @@ -125,8 +125,8 @@ class IsotonicRegressionModel ( * as piecewise linear function and interpolated value is returned. In case there are * multiple values with the same boundary then the same rules as in 2) are used. * - * @since 1.3.0 */ + @Since("1.3.0") def predict(testData: Double): Double = { def linearInterpolation(x1: Double, y1: Double, x2: Double, y2: Double, x: Double): Double = { @@ -160,9 +160,7 @@ class IsotonicRegressionModel ( /** A convenient method for boundaries called by the Python API. */ private[mllib] def predictionVector: Vector = Vectors.dense(predictions) - /** - * @since 1.4.0 - */ + @Since("1.4.0") override def save(sc: SparkContext, path: String): Unit = { IsotonicRegressionModel.SaveLoadV1_0.save(sc, path, boundaries, predictions, isotonic) } @@ -170,9 +168,7 @@ class IsotonicRegressionModel ( override protected def formatVersion: String = "1.0" } -/** - * @since 1.4.0 - */ +@Since("1.4.0") object IsotonicRegressionModel extends Loader[IsotonicRegressionModel] { import org.apache.spark.mllib.util.Loader._ @@ -219,8 +215,8 @@ object IsotonicRegressionModel extends Loader[IsotonicRegressionModel] { } /** - * @since 1.4.0 */ + @Since("1.4.0") override def load(sc: SparkContext, path: String): IsotonicRegressionModel = { implicit val formats = DefaultFormats val (loadedClassName, version, metadata) = loadMetadata(sc, path) diff --git a/mllib/src/main/scala/org/apache/spark/mllib/regression/LabeledPoint.scala b/mllib/src/main/scala/org/apache/spark/mllib/regression/LabeledPoint.scala index 8b51011eeb297..f7fe1b7b21fca 100644 --- a/mllib/src/main/scala/org/apache/spark/mllib/regression/LabeledPoint.scala +++ b/mllib/src/main/scala/org/apache/spark/mllib/regression/LabeledPoint.scala @@ -19,6 +19,7 @@ package org.apache.spark.mllib.regression import scala.beans.BeanInfo +import org.apache.spark.annotation.Since import org.apache.spark.mllib.linalg.{Vectors, Vector} import org.apache.spark.mllib.util.NumericParser import org.apache.spark.SparkException @@ -29,8 +30,8 @@ import org.apache.spark.SparkException * @param label Label for this data point. * @param features List of features for this data point. * - * @since 0.8.0 */ +@Since("0.8.0") @BeanInfo case class LabeledPoint(label: Double, features: Vector) { override def toString: String = { @@ -41,15 +42,15 @@ case class LabeledPoint(label: Double, features: Vector) { /** * Parser for [[org.apache.spark.mllib.regression.LabeledPoint]]. * - * @since 1.1.0 */ +@Since("1.1.0") object LabeledPoint { /** * Parses a string resulted from `LabeledPoint#toString` into * an [[org.apache.spark.mllib.regression.LabeledPoint]]. 
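// A sketch of evaluating an IsotonicRegressionModel with the predict methods annotated above
// (illustrative, not part of the patch). The boundaries/predictions are made up; assumes `sc`.
import org.apache.spark.mllib.regression.IsotonicRegressionModel

val isoModel = new IsotonicRegressionModel(
  Array(0.0, 1.0, 3.0),   // boundaries, sorted ascending
  Array(1.0, 2.0, 4.0),   // predictions at those boundaries
  true)                   // isotonic (non-decreasing)
isoModel.predict(0.5)                                  // linear interpolation between boundaries
isoModel.predict(sc.parallelize(Seq(0.5, 2.0, 5.0)))   // RDD[Double] in, RDD[Double] out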
* - * @since 1.1.0 */ + @Since("1.1.0") def parse(s: String): LabeledPoint = { if (s.startsWith("(")) { NumericParser.parse(s) match { diff --git a/mllib/src/main/scala/org/apache/spark/mllib/regression/Lasso.scala b/mllib/src/main/scala/org/apache/spark/mllib/regression/Lasso.scala index 03eb589b05a0e..556411a366bd2 100644 --- a/mllib/src/main/scala/org/apache/spark/mllib/regression/Lasso.scala +++ b/mllib/src/main/scala/org/apache/spark/mllib/regression/Lasso.scala @@ -18,6 +18,7 @@ package org.apache.spark.mllib.regression import org.apache.spark.SparkContext +import org.apache.spark.annotation.Since import org.apache.spark.mllib.linalg.Vector import org.apache.spark.mllib.optimization._ import org.apache.spark.mllib.pmml.PMMLExportable @@ -31,8 +32,8 @@ import org.apache.spark.rdd.RDD * @param weights Weights computed for every feature. * @param intercept Intercept computed for this model. * - * @since 0.8.0 */ +@Since("0.8.0") class LassoModel ( override val weights: Vector, override val intercept: Double) @@ -46,9 +47,7 @@ class LassoModel ( weightMatrix.toBreeze.dot(dataMatrix.toBreeze) + intercept } - /** - * @since 1.3.0 - */ + @Since("1.3.0") override def save(sc: SparkContext, path: String): Unit = { GLMRegressionModel.SaveLoadV1_0.save(sc, path, this.getClass.getName, weights, intercept) } @@ -56,14 +55,10 @@ class LassoModel ( override protected def formatVersion: String = "1.0" } -/** - * @since 1.3.0 - */ +@Since("1.3.0") object LassoModel extends Loader[LassoModel] { - /** - * @since 1.3.0 - */ + @Since("1.3.0") override def load(sc: SparkContext, path: String): LassoModel = { val (loadedClassName, version, metadata) = Loader.loadMetadata(sc, path) // Hard-code class name string in case it changes in the future @@ -118,8 +113,8 @@ class LassoWithSGD private ( /** * Top-level methods for calling Lasso. * - * @since 0.8.0 */ +@Since("0.8.0") object LassoWithSGD { /** @@ -137,8 +132,8 @@ object LassoWithSGD { * @param initialWeights Initial set of weights to be used. Array should be equal in size to * the number of features in the data. * - * @since 1.0.0 */ + @Since("1.0.0") def train( input: RDD[LabeledPoint], numIterations: Int, @@ -162,8 +157,8 @@ object LassoWithSGD { * @param regParam Regularization parameter. * @param miniBatchFraction Fraction of data to be used per iteration. * - * @since 0.8.0 */ + @Since("0.8.0") def train( input: RDD[LabeledPoint], numIterations: Int, @@ -185,8 +180,8 @@ object LassoWithSGD { * @param numIterations Number of iterations of gradient descent to run. * @return a LassoModel which has the weights and offset from training. * - * @since 0.8.0 */ + @Since("0.8.0") def train( input: RDD[LabeledPoint], numIterations: Int, @@ -205,8 +200,8 @@ object LassoWithSGD { * @param numIterations Number of iterations of gradient descent to run. * @return a LassoModel which has the weights and offset from training. 
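// A sketch of the LabeledPoint representation and its string parser annotated above (illustrative,
// not part of the patch).
import org.apache.spark.mllib.linalg.Vectors
import org.apache.spark.mllib.regression.LabeledPoint

val point = LabeledPoint(1.0, Vectors.dense(0.5, -1.2, 3.0))
val again = LabeledPoint.parse(point.toString)   // "(1.0,[0.5,-1.2,3.0])" parses back losslessly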
* - * @since 0.8.0 */ + @Since("0.8.0") def train( input: RDD[LabeledPoint], numIterations: Int): LassoModel = { diff --git a/mllib/src/main/scala/org/apache/spark/mllib/regression/LinearRegression.scala b/mllib/src/main/scala/org/apache/spark/mllib/regression/LinearRegression.scala index fb5c220daaedb..00ab06e3ba738 100644 --- a/mllib/src/main/scala/org/apache/spark/mllib/regression/LinearRegression.scala +++ b/mllib/src/main/scala/org/apache/spark/mllib/regression/LinearRegression.scala @@ -18,6 +18,7 @@ package org.apache.spark.mllib.regression import org.apache.spark.SparkContext +import org.apache.spark.annotation.Since import org.apache.spark.mllib.linalg.Vector import org.apache.spark.mllib.optimization._ import org.apache.spark.mllib.pmml.PMMLExportable @@ -31,8 +32,8 @@ import org.apache.spark.rdd.RDD * @param weights Weights computed for every feature. * @param intercept Intercept computed for this model. * - * @since 0.8.0 */ +@Since("0.8.0") class LinearRegressionModel ( override val weights: Vector, override val intercept: Double) @@ -46,9 +47,7 @@ class LinearRegressionModel ( weightMatrix.toBreeze.dot(dataMatrix.toBreeze) + intercept } - /** - * @since 1.3.0 - */ + @Since("1.3.0") override def save(sc: SparkContext, path: String): Unit = { GLMRegressionModel.SaveLoadV1_0.save(sc, path, this.getClass.getName, weights, intercept) } @@ -56,14 +55,10 @@ class LinearRegressionModel ( override protected def formatVersion: String = "1.0" } -/** - * @since 1.3.0 - */ +@Since("1.3.0") object LinearRegressionModel extends Loader[LinearRegressionModel] { - /** - * @since 1.3.0 - */ + @Since("1.3.0") override def load(sc: SparkContext, path: String): LinearRegressionModel = { val (loadedClassName, version, metadata) = Loader.loadMetadata(sc, path) // Hard-code class name string in case it changes in the future @@ -117,8 +112,8 @@ class LinearRegressionWithSGD private[mllib] ( /** * Top-level methods for calling LinearRegression. * - * @since 0.8.0 */ +@Since("0.8.0") object LinearRegressionWithSGD { /** @@ -135,8 +130,8 @@ object LinearRegressionWithSGD { * @param initialWeights Initial set of weights to be used. Array should be equal in size to * the number of features in the data. * - * @since 1.0.0 */ + @Since("1.0.0") def train( input: RDD[LabeledPoint], numIterations: Int, @@ -158,8 +153,8 @@ object LinearRegressionWithSGD { * @param stepSize Step size to be used for each iteration of gradient descent. * @param miniBatchFraction Fraction of data to be used per iteration. * - * @since 0.8.0 */ + @Since("0.8.0") def train( input: RDD[LabeledPoint], numIterations: Int, @@ -179,8 +174,8 @@ object LinearRegressionWithSGD { * @param numIterations Number of iterations of gradient descent to run. * @return a LinearRegressionModel which has the weights and offset from training. * - * @since 0.8.0 */ + @Since("0.8.0") def train( input: RDD[LabeledPoint], numIterations: Int, @@ -198,8 +193,8 @@ object LinearRegressionWithSGD { * @param numIterations Number of iterations of gradient descent to run. * @return a LinearRegressionModel which has the weights and offset from training. 
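// A sketch of the top-level train helpers annotated above (illustrative, not part of the patch).
// Assumes `trainingPoints` is an RDD[LabeledPoint] with continuous labels.
import org.apache.spark.mllib.regression.{LassoWithSGD, LinearRegressionWithSGD}

val lassoModel = LassoWithSGD.train(trainingPoints, 100, 1.0, 0.01, 1.0)
  // numIterations = 100, stepSize = 1.0, regParam = 0.01, miniBatchFraction = 1.0
val lrModel = LinearRegressionWithSGD.train(trainingPoints, 100)
val firstPrediction = lrModel.predict(trainingPoints.first().features)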
* - * @since 0.8.0 */ + @Since("0.8.0") def train( input: RDD[LabeledPoint], numIterations: Int): LinearRegressionModel = { diff --git a/mllib/src/main/scala/org/apache/spark/mllib/regression/RegressionModel.scala b/mllib/src/main/scala/org/apache/spark/mllib/regression/RegressionModel.scala index b097fd38fdd82..0e72d6591ce83 100644 --- a/mllib/src/main/scala/org/apache/spark/mllib/regression/RegressionModel.scala +++ b/mllib/src/main/scala/org/apache/spark/mllib/regression/RegressionModel.scala @@ -19,14 +19,12 @@ package org.apache.spark.mllib.regression import org.json4s.{DefaultFormats, JValue} -import org.apache.spark.annotation.Experimental +import org.apache.spark.annotation.{Experimental, Since} import org.apache.spark.api.java.JavaRDD import org.apache.spark.mllib.linalg.Vector import org.apache.spark.rdd.RDD -/** - * @since 0.8.0 - */ +@Since("0.8.0") @Experimental trait RegressionModel extends Serializable { /** @@ -35,8 +33,8 @@ trait RegressionModel extends Serializable { * @param testData RDD representing data points to be predicted * @return RDD[Double] where each entry contains the corresponding prediction * - * @since 1.0.0 */ + @Since("1.0.0") def predict(testData: RDD[Vector]): RDD[Double] /** @@ -45,8 +43,8 @@ trait RegressionModel extends Serializable { * @param testData array representing a single data point * @return Double prediction from the trained model * - * @since 1.0.0 */ + @Since("1.0.0") def predict(testData: Vector): Double /** @@ -54,8 +52,8 @@ trait RegressionModel extends Serializable { * @param testData JavaRDD representing data points to be predicted * @return a JavaRDD[java.lang.Double] where each entry contains the corresponding prediction * - * @since 1.0.0 */ + @Since("1.0.0") def predict(testData: JavaRDD[Vector]): JavaRDD[java.lang.Double] = predict(testData.rdd).toJavaRDD().asInstanceOf[JavaRDD[java.lang.Double]] } diff --git a/mllib/src/main/scala/org/apache/spark/mllib/regression/RidgeRegression.scala b/mllib/src/main/scala/org/apache/spark/mllib/regression/RidgeRegression.scala index 5bced6b4b7b53..21a791d98b2cb 100644 --- a/mllib/src/main/scala/org/apache/spark/mllib/regression/RidgeRegression.scala +++ b/mllib/src/main/scala/org/apache/spark/mllib/regression/RidgeRegression.scala @@ -18,6 +18,7 @@ package org.apache.spark.mllib.regression import org.apache.spark.SparkContext +import org.apache.spark.annotation.Since import org.apache.spark.mllib.linalg.Vector import org.apache.spark.mllib.optimization._ import org.apache.spark.mllib.pmml.PMMLExportable @@ -32,8 +33,8 @@ import org.apache.spark.rdd.RDD * @param weights Weights computed for every feature. * @param intercept Intercept computed for this model. 
* - * @since 0.8.0 */ +@Since("0.8.0") class RidgeRegressionModel ( override val weights: Vector, override val intercept: Double) @@ -47,9 +48,7 @@ class RidgeRegressionModel ( weightMatrix.toBreeze.dot(dataMatrix.toBreeze) + intercept } - /** - * @since 1.3.0 - */ + @Since("1.3.0") override def save(sc: SparkContext, path: String): Unit = { GLMRegressionModel.SaveLoadV1_0.save(sc, path, this.getClass.getName, weights, intercept) } @@ -57,14 +56,10 @@ class RidgeRegressionModel ( override protected def formatVersion: String = "1.0" } -/** - * @since 1.3.0 - */ +@Since("1.3.0") object RidgeRegressionModel extends Loader[RidgeRegressionModel] { - /** - * @since 1.3.0 - */ + @Since("1.3.0") override def load(sc: SparkContext, path: String): RidgeRegressionModel = { val (loadedClassName, version, metadata) = Loader.loadMetadata(sc, path) // Hard-code class name string in case it changes in the future @@ -120,8 +115,8 @@ class RidgeRegressionWithSGD private ( /** * Top-level methods for calling RidgeRegression. * - * @since 0.8.0 */ +@Since("0.8.0") object RidgeRegressionWithSGD { /** @@ -138,8 +133,8 @@ object RidgeRegressionWithSGD { * @param initialWeights Initial set of weights to be used. Array should be equal in size to * the number of features in the data. * - * @since 0.8.0 */ + @Since("0.8.0") def train( input: RDD[LabeledPoint], numIterations: Int, @@ -162,8 +157,8 @@ object RidgeRegressionWithSGD { * @param regParam Regularization parameter. * @param miniBatchFraction Fraction of data to be used per iteration. * - * @since 0.8.0 */ + @Since("0.8.0") def train( input: RDD[LabeledPoint], numIterations: Int, @@ -184,8 +179,8 @@ object RidgeRegressionWithSGD { * @param numIterations Number of iterations of gradient descent to run. * @return a RidgeRegressionModel which has the weights and offset from training. * - * @since 0.8.0 */ + @Since("0.8.0") def train( input: RDD[LabeledPoint], numIterations: Int, @@ -203,8 +198,8 @@ object RidgeRegressionWithSGD { * @param numIterations Number of iterations of gradient descent to run. * @return a RidgeRegressionModel which has the weights and offset from training. * - * @since 0.8.0 */ + @Since("0.8.0") def train( input: RDD[LabeledPoint], numIterations: Int): RidgeRegressionModel = { diff --git a/mllib/src/main/scala/org/apache/spark/mllib/regression/StreamingLinearAlgorithm.scala b/mllib/src/main/scala/org/apache/spark/mllib/regression/StreamingLinearAlgorithm.scala index a2ab95c474765..cd3ed8a1549db 100644 --- a/mllib/src/main/scala/org/apache/spark/mllib/regression/StreamingLinearAlgorithm.scala +++ b/mllib/src/main/scala/org/apache/spark/mllib/regression/StreamingLinearAlgorithm.scala @@ -20,7 +20,7 @@ package org.apache.spark.mllib.regression import scala.reflect.ClassTag import org.apache.spark.Logging -import org.apache.spark.annotation.DeveloperApi +import org.apache.spark.annotation.{DeveloperApi, Since} import org.apache.spark.api.java.JavaSparkContext.fakeClassTag import org.apache.spark.mllib.linalg.{Vector, Vectors} import org.apache.spark.streaming.api.java.{JavaDStream, JavaPairDStream} @@ -54,8 +54,8 @@ import org.apache.spark.streaming.dstream.DStream * the model using each of the different sources, in sequence. * * - * @since 1.1.0 */ +@Since("1.1.0") @DeveloperApi abstract class StreamingLinearAlgorithm[ M <: GeneralizedLinearModel, @@ -70,8 +70,8 @@ abstract class StreamingLinearAlgorithm[ /** * Return the latest model. 
* - * @since 1.1.0 */ + @Since("1.1.0") def latestModel(): M = { model.get } @@ -84,8 +84,8 @@ abstract class StreamingLinearAlgorithm[ * * @param data DStream containing labeled data * - * @since 1.3.0 */ + @Since("1.3.0") def trainOn(data: DStream[LabeledPoint]): Unit = { if (model.isEmpty) { throw new IllegalArgumentException("Model must be initialized before starting training.") @@ -106,8 +106,8 @@ abstract class StreamingLinearAlgorithm[ /** * Java-friendly version of `trainOn`. * - * @since 1.3.0 */ + @Since("1.3.0") def trainOn(data: JavaDStream[LabeledPoint]): Unit = trainOn(data.dstream) /** @@ -116,8 +116,8 @@ abstract class StreamingLinearAlgorithm[ * @param data DStream containing feature vectors * @return DStream containing predictions * - * @since 1.1.0 */ + @Since("1.1.0") def predictOn(data: DStream[Vector]): DStream[Double] = { if (model.isEmpty) { throw new IllegalArgumentException("Model must be initialized before starting prediction.") @@ -128,8 +128,8 @@ abstract class StreamingLinearAlgorithm[ /** * Java-friendly version of `predictOn`. * - * @since 1.1.0 */ + @Since("1.1.0") def predictOn(data: JavaDStream[Vector]): JavaDStream[java.lang.Double] = { JavaDStream.fromDStream(predictOn(data.dstream).asInstanceOf[DStream[java.lang.Double]]) } @@ -140,8 +140,8 @@ abstract class StreamingLinearAlgorithm[ * @tparam K key type * @return DStream containing the input keys and the predictions as values * - * @since 1.1.0 */ + @Since("1.1.0") def predictOnValues[K: ClassTag](data: DStream[(K, Vector)]): DStream[(K, Double)] = { if (model.isEmpty) { throw new IllegalArgumentException("Model must be initialized before starting prediction") @@ -153,8 +153,8 @@ abstract class StreamingLinearAlgorithm[ /** * Java-friendly version of `predictOnValues`. * - * @since 1.3.0 */ + @Since("1.3.0") def predictOnValues[K](data: JavaPairDStream[K, Vector]): JavaPairDStream[K, java.lang.Double] = { implicit val tag = fakeClassTag[K] JavaPairDStream.fromPairDStream( diff --git a/mllib/src/main/scala/org/apache/spark/mllib/stat/KernelDensity.scala b/mllib/src/main/scala/org/apache/spark/mllib/stat/KernelDensity.scala index 93a6753efd4d9..4a856f7f3434a 100644 --- a/mllib/src/main/scala/org/apache/spark/mllib/stat/KernelDensity.scala +++ b/mllib/src/main/scala/org/apache/spark/mllib/stat/KernelDensity.scala @@ -19,7 +19,7 @@ package org.apache.spark.mllib.stat import com.github.fommil.netlib.BLAS.{getInstance => blas} -import org.apache.spark.annotation.Experimental +import org.apache.spark.annotation.{Experimental, Since} import org.apache.spark.api.java.JavaRDD import org.apache.spark.rdd.RDD @@ -37,8 +37,8 @@ import org.apache.spark.rdd.RDD * .setBandwidth(3.0) * val densities = kd.estimate(Array(-1.0, 2.0, 5.0)) * }}} - * @since 1.4.0 */ +@Since("1.4.0") @Experimental class KernelDensity extends Serializable { @@ -52,8 +52,8 @@ class KernelDensity extends Serializable { /** * Sets the bandwidth (standard deviation) of the Gaussian kernel (default: `1.0`). - * @since 1.4.0 */ + @Since("1.4.0") def setBandwidth(bandwidth: Double): this.type = { require(bandwidth > 0, s"Bandwidth must be positive, but got $bandwidth.") this.bandwidth = bandwidth @@ -62,8 +62,8 @@ class KernelDensity extends Serializable { /** * Sets the sample to use for density estimation. - * @since 1.4.0 */ + @Since("1.4.0") def setSample(sample: RDD[Double]): this.type = { this.sample = sample this @@ -71,8 +71,8 @@ class KernelDensity extends Serializable { /** * Sets the sample to use for density estimation (for Java users). 
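// A sketch of the StreamingLinearAlgorithm contract annotated above, via its concrete subclass
// StreamingLinearRegressionWithSGD, which is defined elsewhere in mllib (illustrative, not part of
// the patch). Assumes `trainingStream` is a DStream[LabeledPoint] and `testStream` a
// DStream[(Long, Vector)] built from a StreamingContext.
import org.apache.spark.mllib.linalg.Vectors
import org.apache.spark.mllib.regression.StreamingLinearRegressionWithSGD

val streamingLR = new StreamingLinearRegressionWithSGD()
  .setInitialWeights(Vectors.zeros(3))         // the model must be initialized before trainOn/predictOn
streamingLR.trainOn(trainingStream)            // latestModel() is refreshed as each batch arrives
val scored = streamingLR.predictOnValues(testStream)   // DStream[(Long, Double)]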
- * @since 1.4.0 */ + @Since("1.4.0") def setSample(sample: JavaRDD[java.lang.Double]): this.type = { this.sample = sample.rdd.asInstanceOf[RDD[Double]] this @@ -80,8 +80,8 @@ class KernelDensity extends Serializable { /** * Estimates probability density function at the given array of points. - * @since 1.4.0 */ + @Since("1.4.0") def estimate(points: Array[Double]): Array[Double] = { val sample = this.sample val bandwidth = this.bandwidth diff --git a/mllib/src/main/scala/org/apache/spark/mllib/stat/MultivariateOnlineSummarizer.scala b/mllib/src/main/scala/org/apache/spark/mllib/stat/MultivariateOnlineSummarizer.scala index 64e4be0ebb97e..51b713e263e0c 100644 --- a/mllib/src/main/scala/org/apache/spark/mllib/stat/MultivariateOnlineSummarizer.scala +++ b/mllib/src/main/scala/org/apache/spark/mllib/stat/MultivariateOnlineSummarizer.scala @@ -17,7 +17,7 @@ package org.apache.spark.mllib.stat -import org.apache.spark.annotation.DeveloperApi +import org.apache.spark.annotation.{DeveloperApi, Since} import org.apache.spark.mllib.linalg.{Vectors, Vector} /** @@ -33,8 +33,8 @@ import org.apache.spark.mllib.linalg.{Vectors, Vector} * Reference: [[http://en.wikipedia.org/wiki/Algorithms_for_calculating_variance variance-wiki]] * Zero elements (including explicit zero values) are skipped when calling add(), * to have time complexity O(nnz) instead of O(n) for each column. - * @since 1.1.0 */ +@Since("1.1.0") @DeveloperApi class MultivariateOnlineSummarizer extends MultivariateStatisticalSummary with Serializable { @@ -53,8 +53,8 @@ class MultivariateOnlineSummarizer extends MultivariateStatisticalSummary with S * * @param sample The sample in dense/sparse vector format to be added into this summarizer. * @return This MultivariateOnlineSummarizer object. - * @since 1.1.0 */ + @Since("1.1.0") def add(sample: Vector): this.type = { if (n == 0) { require(sample.size > 0, s"Vector should have dimension larger than zero.") @@ -109,8 +109,8 @@ class MultivariateOnlineSummarizer extends MultivariateStatisticalSummary with S * * @param other The other MultivariateOnlineSummarizer to be merged. * @return This MultivariateOnlineSummarizer object. - * @since 1.1.0 */ + @Since("1.1.0") def merge(other: MultivariateOnlineSummarizer): this.type = { if (this.totalCnt != 0 && other.totalCnt != 0) { require(n == other.n, s"Dimensions mismatch when merging with another summarizer. " + @@ -155,8 +155,8 @@ class MultivariateOnlineSummarizer extends MultivariateStatisticalSummary with S /** * Sample mean of each dimension. * - * @since 1.1.0 */ + @Since("1.1.0") override def mean: Vector = { require(totalCnt > 0, s"Nothing has been added to this summarizer.") @@ -172,8 +172,8 @@ class MultivariateOnlineSummarizer extends MultivariateStatisticalSummary with S /** * Sample variance of each dimension. * - * @since 1.1.0 */ + @Since("1.1.0") override def variance: Vector = { require(totalCnt > 0, s"Nothing has been added to this summarizer.") @@ -199,15 +199,15 @@ class MultivariateOnlineSummarizer extends MultivariateStatisticalSummary with S /** * Sample size. * - * @since 1.1.0 */ + @Since("1.1.0") override def count: Long = totalCnt /** * Number of nonzero elements in each dimension. * - * @since 1.1.0 */ + @Since("1.1.0") override def numNonzeros: Vector = { require(totalCnt > 0, s"Nothing has been added to this summarizer.") @@ -217,8 +217,8 @@ class MultivariateOnlineSummarizer extends MultivariateStatisticalSummary with S /** * Maximum value of each dimension. 
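// A sketch of kernel density estimation with the API annotated above, mirroring its Scaladoc
// example (illustrative, not part of the patch). Assumes a SparkContext `sc`; the sample is made up.
import org.apache.spark.mllib.stat.KernelDensity

val sample = sc.parallelize(Seq(1.0, 1.5, 2.0, 2.5, 8.0))
val kd = new KernelDensity()
  .setSample(sample)
  .setBandwidth(0.5)
val densities = kd.estimate(Array(0.0, 2.0, 8.0))   // estimated PDF values at the query points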
* - * @since 1.1.0 */ + @Since("1.1.0") override def max: Vector = { require(totalCnt > 0, s"Nothing has been added to this summarizer.") @@ -233,8 +233,8 @@ class MultivariateOnlineSummarizer extends MultivariateStatisticalSummary with S /** * Minimum value of each dimension. * - * @since 1.1.0 */ + @Since("1.1.0") override def min: Vector = { require(totalCnt > 0, s"Nothing has been added to this summarizer.") @@ -249,8 +249,8 @@ class MultivariateOnlineSummarizer extends MultivariateStatisticalSummary with S /** * L2 (Euclidian) norm of each dimension. * - * @since 1.2.0 */ + @Since("1.2.0") override def normL2: Vector = { require(totalCnt > 0, s"Nothing has been added to this summarizer.") @@ -268,8 +268,8 @@ class MultivariateOnlineSummarizer extends MultivariateStatisticalSummary with S /** * L1 norm of each dimension. * - * @since 1.2.0 */ + @Since("1.2.0") override def normL1: Vector = { require(totalCnt > 0, s"Nothing has been added to this summarizer.") diff --git a/mllib/src/main/scala/org/apache/spark/mllib/stat/MultivariateStatisticalSummary.scala b/mllib/src/main/scala/org/apache/spark/mllib/stat/MultivariateStatisticalSummary.scala index 3bb49f12289e1..39a16fb743d64 100644 --- a/mllib/src/main/scala/org/apache/spark/mllib/stat/MultivariateStatisticalSummary.scala +++ b/mllib/src/main/scala/org/apache/spark/mllib/stat/MultivariateStatisticalSummary.scala @@ -17,59 +17,60 @@ package org.apache.spark.mllib.stat +import org.apache.spark.annotation.Since import org.apache.spark.mllib.linalg.Vector /** * Trait for multivariate statistical summary of a data matrix. - * @since 1.0.0 */ +@Since("1.0.0") trait MultivariateStatisticalSummary { /** * Sample mean vector. - * @since 1.0.0 */ + @Since("1.0.0") def mean: Vector /** * Sample variance vector. Should return a zero vector if the sample size is 1. - * @since 1.0.0 */ + @Since("1.0.0") def variance: Vector /** * Sample size. - * @since 1.0.0 */ + @Since("1.0.0") def count: Long /** * Number of nonzero elements (including explicitly presented zero values) in each column. - * @since 1.0.0 */ + @Since("1.0.0") def numNonzeros: Vector /** * Maximum value of each column. - * @since 1.0.0 */ + @Since("1.0.0") def max: Vector /** * Minimum value of each column. - * @since 1.0.0 */ + @Since("1.0.0") def min: Vector /** * Euclidean magnitude of each column - * @since 1.2.0 */ + @Since("1.2.0") def normL2: Vector /** * L1 norm of each column - * @since 1.2.0 */ + @Since("1.2.0") def normL1: Vector } diff --git a/mllib/src/main/scala/org/apache/spark/mllib/stat/Statistics.scala b/mllib/src/main/scala/org/apache/spark/mllib/stat/Statistics.scala index ef8d78607048f..84d64a5bfb38e 100644 --- a/mllib/src/main/scala/org/apache/spark/mllib/stat/Statistics.scala +++ b/mllib/src/main/scala/org/apache/spark/mllib/stat/Statistics.scala @@ -19,7 +19,7 @@ package org.apache.spark.mllib.stat import scala.annotation.varargs -import org.apache.spark.annotation.Experimental +import org.apache.spark.annotation.{Experimental, Since} import org.apache.spark.api.java.{JavaRDD, JavaDoubleRDD} import org.apache.spark.mllib.linalg.distributed.RowMatrix import org.apache.spark.mllib.linalg.{Matrix, Vector} @@ -32,8 +32,8 @@ import org.apache.spark.rdd.RDD /** * :: Experimental :: * API for statistical functions in MLlib. - * @since 1.1.0 */ +@Since("1.1.0") @Experimental object Statistics { @@ -42,8 +42,8 @@ object Statistics { * * @param X an RDD[Vector] for which column-wise summary statistics are to be computed. 
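// A sketch of aggregating column statistics with the MultivariateOnlineSummarizer annotated above,
// using the same treeAggregate pattern as computeColumnSummaryStatistics (illustrative, not part
// of the patch). Assumes `vectors` is an RDD[Vector].
import org.apache.spark.mllib.linalg.Vector
import org.apache.spark.mllib.stat.MultivariateOnlineSummarizer

val summary = vectors.treeAggregate(new MultivariateOnlineSummarizer)(
  (agg, v: Vector) => agg.add(v),     // add() skips explicit zeros, so cost is O(nnz) per vector
  (a, b) => a.merge(b))
println(s"mean=${summary.mean} variance=${summary.variance} count=${summary.count}")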
* @return [[MultivariateStatisticalSummary]] object containing column-wise summary statistics. - * @since 1.1.0 */ + @Since("1.1.0") def colStats(X: RDD[Vector]): MultivariateStatisticalSummary = { new RowMatrix(X).computeColumnSummaryStatistics() } @@ -54,8 +54,8 @@ object Statistics { * * @param X an RDD[Vector] for which the correlation matrix is to be computed. * @return Pearson correlation matrix comparing columns in X. - * @since 1.1.0 */ + @Since("1.1.0") def corr(X: RDD[Vector]): Matrix = Correlations.corrMatrix(X) /** @@ -71,8 +71,8 @@ object Statistics { * @param method String specifying the method to use for computing correlation. * Supported: `pearson` (default), `spearman` * @return Correlation matrix comparing columns in X. - * @since 1.1.0 */ + @Since("1.1.0") def corr(X: RDD[Vector], method: String): Matrix = Correlations.corrMatrix(X, method) /** @@ -85,14 +85,14 @@ object Statistics { * @param x RDD[Double] of the same cardinality as y. * @param y RDD[Double] of the same cardinality as x. * @return A Double containing the Pearson correlation between the two input RDD[Double]s - * @since 1.1.0 */ + @Since("1.1.0") def corr(x: RDD[Double], y: RDD[Double]): Double = Correlations.corr(x, y) /** * Java-friendly version of [[corr()]] - * @since 1.4.1 */ + @Since("1.4.1") def corr(x: JavaRDD[java.lang.Double], y: JavaRDD[java.lang.Double]): Double = corr(x.rdd.asInstanceOf[RDD[Double]], y.rdd.asInstanceOf[RDD[Double]]) @@ -109,14 +109,14 @@ object Statistics { * Supported: `pearson` (default), `spearman` * @return A Double containing the correlation between the two input RDD[Double]s using the * specified method. - * @since 1.1.0 */ + @Since("1.1.0") def corr(x: RDD[Double], y: RDD[Double], method: String): Double = Correlations.corr(x, y, method) /** * Java-friendly version of [[corr()]] - * @since 1.4.1 */ + @Since("1.4.1") def corr(x: JavaRDD[java.lang.Double], y: JavaRDD[java.lang.Double], method: String): Double = corr(x.rdd.asInstanceOf[RDD[Double]], y.rdd.asInstanceOf[RDD[Double]], method) @@ -133,8 +133,8 @@ object Statistics { * `expected` is rescaled if the `expected` sum differs from the `observed` sum. * @return ChiSquaredTest object containing the test statistic, degrees of freedom, p-value, * the method used, and the null hypothesis. - * @since 1.1.0 */ + @Since("1.1.0") def chiSqTest(observed: Vector, expected: Vector): ChiSqTestResult = { ChiSqTest.chiSquared(observed, expected) } @@ -148,8 +148,8 @@ object Statistics { * @param observed Vector containing the observed categorical counts/relative frequencies. * @return ChiSquaredTest object containing the test statistic, degrees of freedom, p-value, * the method used, and the null hypothesis. - * @since 1.1.0 */ + @Since("1.1.0") def chiSqTest(observed: Vector): ChiSqTestResult = ChiSqTest.chiSquared(observed) /** @@ -159,8 +159,8 @@ object Statistics { * @param observed The contingency matrix (containing either counts or relative frequencies). * @return ChiSquaredTest object containing the test statistic, degrees of freedom, p-value, * the method used, and the null hypothesis. - * @since 1.1.0 */ + @Since("1.1.0") def chiSqTest(observed: Matrix): ChiSqTestResult = ChiSqTest.chiSquaredMatrix(observed) /** @@ -172,13 +172,14 @@ object Statistics { * Real-valued features will be treated as categorical for each distinct value. * @return an array containing the ChiSquaredTestResult for every feature against the label. * The order of the elements in the returned array reflects the order of input features. 
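// A sketch of the Statistics entry points annotated above (illustrative, not part of the patch).
// Assumes a SparkContext `sc`; the observations are made up.
import org.apache.spark.mllib.linalg.Vectors
import org.apache.spark.mllib.stat.Statistics

val observations = sc.parallelize(Seq(
  Vectors.dense(1.0, 10.0), Vectors.dense(2.0, 20.0), Vectors.dense(3.0, 33.0)))
val colSummary = Statistics.colStats(observations)        // column-wise mean, variance, count, ...
val pearson    = Statistics.corr(observations)            // Pearson correlation matrix
val spearman   = Statistics.corr(observations, "spearman")
val gof        = Statistics.chiSqTest(Vectors.dense(10.0, 20.0, 30.0))  // goodness of fit vs uniform
println(gof.pValue)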
- * @since 1.1.0 */ + @Since("1.1.0") def chiSqTest(data: RDD[LabeledPoint]): Array[ChiSqTestResult] = { ChiSqTest.chiSquaredFeatures(data) } /** Java-friendly version of [[chiSqTest()]] */ + @Since("1.5.0") def chiSqTest(data: JavaRDD[LabeledPoint]): Array[ChiSqTestResult] = chiSqTest(data.rdd) /** @@ -194,6 +195,7 @@ object Statistics { * @return [[org.apache.spark.mllib.stat.test.KolmogorovSmirnovTestResult]] object containing test * statistic, p-value, and null hypothesis. */ + @Since("1.5.0") def kolmogorovSmirnovTest(data: RDD[Double], cdf: Double => Double) : KolmogorovSmirnovTestResult = { KolmogorovSmirnovTest.testOneSample(data, cdf) @@ -210,6 +212,7 @@ object Statistics { * @return [[org.apache.spark.mllib.stat.test.KolmogorovSmirnovTestResult]] object containing test * statistic, p-value, and null hypothesis. */ + @Since("1.5.0") @varargs def kolmogorovSmirnovTest(data: RDD[Double], distName: String, params: Double*) : KolmogorovSmirnovTestResult = { @@ -217,6 +220,7 @@ object Statistics { } /** Java-friendly version of [[kolmogorovSmirnovTest()]] */ + @Since("1.5.0") @varargs def kolmogorovSmirnovTest( data: JavaDoubleRDD, diff --git a/mllib/src/main/scala/org/apache/spark/mllib/stat/distribution/MultivariateGaussian.scala b/mllib/src/main/scala/org/apache/spark/mllib/stat/distribution/MultivariateGaussian.scala index 9aa7763d7890d..bd4d81390bfae 100644 --- a/mllib/src/main/scala/org/apache/spark/mllib/stat/distribution/MultivariateGaussian.scala +++ b/mllib/src/main/scala/org/apache/spark/mllib/stat/distribution/MultivariateGaussian.scala @@ -19,7 +19,7 @@ package org.apache.spark.mllib.stat.distribution import breeze.linalg.{DenseVector => DBV, DenseMatrix => DBM, diag, max, eigSym, Vector => BV} -import org.apache.spark.annotation.DeveloperApi; +import org.apache.spark.annotation.{DeveloperApi, Since} import org.apache.spark.mllib.linalg.{Vectors, Vector, Matrices, Matrix} import org.apache.spark.mllib.util.MLUtils @@ -32,8 +32,8 @@ import org.apache.spark.mllib.util.MLUtils * * @param mu The mean vector of the distribution * @param sigma The covariance matrix of the distribution - * @since 1.3.0 */ +@Since("1.3.0") @DeveloperApi class MultivariateGaussian ( val mu: Vector, @@ -62,15 +62,15 @@ class MultivariateGaussian ( private val (rootSigmaInv: DBM[Double], u: Double) = calculateCovarianceConstants /** Returns density of this multivariate Gaussian at given point, x - * @since 1.3.0 */ + @Since("1.3.0") def pdf(x: Vector): Double = { pdf(x.toBreeze) } /** Returns the log-density of this multivariate Gaussian at given point, x - * @since 1.3.0 */ + @Since("1.3.0") def logpdf(x: Vector): Double = { logpdf(x.toBreeze) } diff --git a/mllib/src/main/scala/org/apache/spark/mllib/tree/DecisionTree.scala b/mllib/src/main/scala/org/apache/spark/mllib/tree/DecisionTree.scala index e5200b86fddd4..972841015d4f0 100644 --- a/mllib/src/main/scala/org/apache/spark/mllib/tree/DecisionTree.scala +++ b/mllib/src/main/scala/org/apache/spark/mllib/tree/DecisionTree.scala @@ -22,7 +22,7 @@ import scala.collection.mutable import scala.collection.mutable.ArrayBuilder import org.apache.spark.Logging -import org.apache.spark.annotation.Experimental +import org.apache.spark.annotation.{Experimental, Since} import org.apache.spark.api.java.JavaRDD import org.apache.spark.mllib.regression.LabeledPoint import org.apache.spark.mllib.tree.RandomForest.NodeIndexInfo @@ -43,8 +43,8 @@ import org.apache.spark.util.random.XORShiftRandom * @param strategy The configuration parameters for the tree algorithm 
which specify the type * of algorithm (classification, regression, etc.), feature type (continuous, * categorical), depth of the tree, quantile calculation strategy, etc. - * @since 1.0.0 */ +@Since("1.0.0") @Experimental class DecisionTree (private val strategy: Strategy) extends Serializable with Logging { @@ -54,8 +54,8 @@ class DecisionTree (private val strategy: Strategy) extends Serializable with Lo * Method to train a decision tree model over an RDD * @param input Training data: RDD of [[org.apache.spark.mllib.regression.LabeledPoint]] * @return DecisionTreeModel that can be used for prediction - * @since 1.2.0 */ + @Since("1.2.0") def run(input: RDD[LabeledPoint]): DecisionTreeModel = { // Note: random seed will not be used since numTrees = 1. val rf = new RandomForest(strategy, numTrees = 1, featureSubsetStrategy = "all", seed = 0) @@ -64,9 +64,7 @@ class DecisionTree (private val strategy: Strategy) extends Serializable with Lo } } -/** - * @since 1.0.0 - */ +@Since("1.0.0") object DecisionTree extends Serializable with Logging { /** @@ -84,8 +82,8 @@ object DecisionTree extends Serializable with Logging { * of algorithm (classification, regression, etc.), feature type (continuous, * categorical), depth of the tree, quantile calculation strategy, etc. * @return DecisionTreeModel that can be used for prediction - * @since 1.0.0 - */ + */ + @Since("1.0.0") def train(input: RDD[LabeledPoint], strategy: Strategy): DecisionTreeModel = { new DecisionTree(strategy).run(input) } @@ -106,8 +104,8 @@ object DecisionTree extends Serializable with Logging { * @param maxDepth Maximum depth of the tree. * E.g., depth 0 means 1 leaf node; depth 1 means 1 internal node + 2 leaf nodes. * @return DecisionTreeModel that can be used for prediction - * @since 1.0.0 */ + @Since("1.0.0") def train( input: RDD[LabeledPoint], algo: Algo, @@ -134,8 +132,8 @@ object DecisionTree extends Serializable with Logging { * E.g., depth 0 means 1 leaf node; depth 1 means 1 internal node + 2 leaf nodes. * @param numClasses number of classes for classification. Default value of 2. * @return DecisionTreeModel that can be used for prediction - * @since 1.2.0 */ + @Since("1.2.0") def train( input: RDD[LabeledPoint], algo: Algo, @@ -168,8 +166,8 @@ object DecisionTree extends Serializable with Logging { * E.g., an entry (n -> k) indicates that feature n is categorical * with k categories indexed from 0: {0, 1, ..., k-1}. 
* @return DecisionTreeModel that can be used for prediction - * @since 1.0.0 */ + @Since("1.0.0") def train( input: RDD[LabeledPoint], algo: Algo, @@ -201,8 +199,8 @@ object DecisionTree extends Serializable with Logging { * @param maxBins maximum number of bins used for splitting features * (suggested value: 32) * @return DecisionTreeModel that can be used for prediction - * @since 1.1.0 */ + @Since("1.1.0") def trainClassifier( input: RDD[LabeledPoint], numClasses: Int, @@ -217,8 +215,8 @@ object DecisionTree extends Serializable with Logging { /** * Java-friendly API for [[org.apache.spark.mllib.tree.DecisionTree$#trainClassifier]] - * @since 1.1.0 */ + @Since("1.1.0") def trainClassifier( input: JavaRDD[LabeledPoint], numClasses: Int, @@ -247,8 +245,8 @@ object DecisionTree extends Serializable with Logging { * @param maxBins maximum number of bins used for splitting features * (suggested value: 32) * @return DecisionTreeModel that can be used for prediction - * @since 1.1.0 */ + @Since("1.1.0") def trainRegressor( input: RDD[LabeledPoint], categoricalFeaturesInfo: Map[Int, Int], @@ -261,8 +259,8 @@ object DecisionTree extends Serializable with Logging { /** * Java-friendly API for [[org.apache.spark.mllib.tree.DecisionTree$#trainRegressor]] - * @since 1.1.0 */ + @Since("1.1.0") def trainRegressor( input: JavaRDD[LabeledPoint], categoricalFeaturesInfo: java.util.Map[java.lang.Integer, java.lang.Integer], diff --git a/mllib/src/main/scala/org/apache/spark/mllib/tree/GradientBoostedTrees.scala b/mllib/src/main/scala/org/apache/spark/mllib/tree/GradientBoostedTrees.scala index 143617098637a..e750408600c33 100644 --- a/mllib/src/main/scala/org/apache/spark/mllib/tree/GradientBoostedTrees.scala +++ b/mllib/src/main/scala/org/apache/spark/mllib/tree/GradientBoostedTrees.scala @@ -18,7 +18,7 @@ package org.apache.spark.mllib.tree import org.apache.spark.Logging -import org.apache.spark.annotation.Experimental +import org.apache.spark.annotation.{Experimental, Since} import org.apache.spark.api.java.JavaRDD import org.apache.spark.mllib.impl.PeriodicRDDCheckpointer import org.apache.spark.mllib.regression.LabeledPoint @@ -48,8 +48,8 @@ import org.apache.spark.storage.StorageLevel * for other loss functions. * * @param boostingStrategy Parameters for the gradient boosting algorithm. - * @since 1.2.0 */ +@Since("1.2.0") @Experimental class GradientBoostedTrees(private val boostingStrategy: BoostingStrategy) extends Serializable with Logging { @@ -58,8 +58,8 @@ class GradientBoostedTrees(private val boostingStrategy: BoostingStrategy) * Method to train a gradient boosting model * @param input Training dataset: RDD of [[org.apache.spark.mllib.regression.LabeledPoint]]. * @return a gradient boosted trees model that can be used for prediction - * @since 1.2.0 */ + @Since("1.2.0") def run(input: RDD[LabeledPoint]): GradientBoostedTreesModel = { val algo = boostingStrategy.treeStrategy.algo algo match { @@ -76,8 +76,8 @@ class GradientBoostedTrees(private val boostingStrategy: BoostingStrategy) /** * Java-friendly API for [[org.apache.spark.mllib.tree.GradientBoostedTrees!#run]]. 
- * @since 1.2.0 */ + @Since("1.2.0") def run(input: JavaRDD[LabeledPoint]): GradientBoostedTreesModel = { run(input.rdd) } @@ -91,8 +91,8 @@ class GradientBoostedTrees(private val boostingStrategy: BoostingStrategy) * E.g., these two datasets could be created from an original dataset * by using [[org.apache.spark.rdd.RDD.randomSplit()]] * @return a gradient boosted trees model that can be used for prediction - * @since 1.4.0 */ + @Since("1.4.0") def runWithValidation( input: RDD[LabeledPoint], validationInput: RDD[LabeledPoint]): GradientBoostedTreesModel = { @@ -115,8 +115,8 @@ class GradientBoostedTrees(private val boostingStrategy: BoostingStrategy) /** * Java-friendly API for [[org.apache.spark.mllib.tree.GradientBoostedTrees!#runWithValidation]]. - * @since 1.4.0 */ + @Since("1.4.0") def runWithValidation( input: JavaRDD[LabeledPoint], validationInput: JavaRDD[LabeledPoint]): GradientBoostedTreesModel = { @@ -124,9 +124,7 @@ class GradientBoostedTrees(private val boostingStrategy: BoostingStrategy) } } -/** - * @since 1.2.0 - */ +@Since("1.2.0") object GradientBoostedTrees extends Logging { /** @@ -137,8 +135,8 @@ object GradientBoostedTrees extends Logging { * For regression, labels are real numbers. * @param boostingStrategy Configuration options for the boosting algorithm. * @return a gradient boosted trees model that can be used for prediction - * @since 1.2.0 */ + @Since("1.2.0") def train( input: RDD[LabeledPoint], boostingStrategy: BoostingStrategy): GradientBoostedTreesModel = { @@ -147,8 +145,8 @@ object GradientBoostedTrees extends Logging { /** * Java-friendly API for [[org.apache.spark.mllib.tree.GradientBoostedTrees$#train]] - * @since 1.2.0 */ + @Since("1.2.0") def train( input: JavaRDD[LabeledPoint], boostingStrategy: BoostingStrategy): GradientBoostedTreesModel = { diff --git a/mllib/src/main/scala/org/apache/spark/mllib/tree/RandomForest.scala b/mllib/src/main/scala/org/apache/spark/mllib/tree/RandomForest.scala index 9f3230656acc5..63a902f3eb51e 100644 --- a/mllib/src/main/scala/org/apache/spark/mllib/tree/RandomForest.scala +++ b/mllib/src/main/scala/org/apache/spark/mllib/tree/RandomForest.scala @@ -23,7 +23,7 @@ import scala.collection.mutable import scala.collection.JavaConverters._ import org.apache.spark.Logging -import org.apache.spark.annotation.Experimental +import org.apache.spark.annotation.{Experimental, Since} import org.apache.spark.api.java.JavaRDD import org.apache.spark.mllib.regression.LabeledPoint import org.apache.spark.mllib.tree.configuration.Strategy @@ -260,9 +260,7 @@ private class RandomForest ( } -/** - * @since 1.2.0 - */ +@Since("1.2.0") object RandomForest extends Serializable with Logging { /** @@ -279,8 +277,8 @@ object RandomForest extends Serializable with Logging { * if numTrees > 1 (forest) set to "sqrt". * @param seed Random seed for bootstrapping and choosing feature subsets. * @return a random forest model that can be used for prediction - * @since 1.2.0 */ + @Since("1.2.0") def trainClassifier( input: RDD[LabeledPoint], strategy: Strategy, @@ -317,8 +315,8 @@ object RandomForest extends Serializable with Logging { * (suggested value: 100) * @param seed Random seed for bootstrapping and choosing feature subsets. 
* @return a random forest model that can be used for prediction - * @since 1.2.0 */ + @Since("1.2.0") def trainClassifier( input: RDD[LabeledPoint], numClasses: Int, @@ -337,8 +335,8 @@ object RandomForest extends Serializable with Logging { /** * Java-friendly API for [[org.apache.spark.mllib.tree.RandomForest$#trainClassifier]] - * @since 1.2.0 */ + @Since("1.2.0") def trainClassifier( input: JavaRDD[LabeledPoint], numClasses: Int, @@ -368,8 +366,8 @@ object RandomForest extends Serializable with Logging { * if numTrees > 1 (forest) set to "onethird". * @param seed Random seed for bootstrapping and choosing feature subsets. * @return a random forest model that can be used for prediction - * @since 1.2.0 */ + @Since("1.2.0") def trainRegressor( input: RDD[LabeledPoint], strategy: Strategy, @@ -405,8 +403,8 @@ object RandomForest extends Serializable with Logging { * (suggested value: 100) * @param seed Random seed for bootstrapping and choosing feature subsets. * @return a random forest model that can be used for prediction - * @since 1.2.0 */ + @Since("1.2.0") def trainRegressor( input: RDD[LabeledPoint], categoricalFeaturesInfo: Map[Int, Int], @@ -424,8 +422,8 @@ object RandomForest extends Serializable with Logging { /** * Java-friendly API for [[org.apache.spark.mllib.tree.RandomForest$#trainRegressor]] - * @since 1.2.0 */ + @Since("1.2.0") def trainRegressor( input: JavaRDD[LabeledPoint], categoricalFeaturesInfo: java.util.Map[java.lang.Integer, java.lang.Integer], @@ -442,8 +440,8 @@ object RandomForest extends Serializable with Logging { /** * List of supported feature subset sampling strategies. - * @since 1.2.0 */ + @Since("1.2.0") val supportedFeatureSubsetStrategies: Array[String] = Array("auto", "all", "sqrt", "log2", "onethird") diff --git a/mllib/src/main/scala/org/apache/spark/mllib/tree/configuration/Algo.scala b/mllib/src/main/scala/org/apache/spark/mllib/tree/configuration/Algo.scala index d9a49aa71fcfb..8301ad160836b 100644 --- a/mllib/src/main/scala/org/apache/spark/mllib/tree/configuration/Algo.scala +++ b/mllib/src/main/scala/org/apache/spark/mllib/tree/configuration/Algo.scala @@ -17,13 +17,13 @@ package org.apache.spark.mllib.tree.configuration -import org.apache.spark.annotation.Experimental +import org.apache.spark.annotation.{Experimental, Since} /** * :: Experimental :: * Enum to select the algorithm for the decision tree - * @since 1.0.0 */ +@Since("1.0.0") @Experimental object Algo extends Enumeration { type Algo = Value diff --git a/mllib/src/main/scala/org/apache/spark/mllib/tree/configuration/BoostingStrategy.scala b/mllib/src/main/scala/org/apache/spark/mllib/tree/configuration/BoostingStrategy.scala index 88e5f57e9ab32..7c569981977b4 100644 --- a/mllib/src/main/scala/org/apache/spark/mllib/tree/configuration/BoostingStrategy.scala +++ b/mllib/src/main/scala/org/apache/spark/mllib/tree/configuration/BoostingStrategy.scala @@ -19,7 +19,7 @@ package org.apache.spark.mllib.tree.configuration import scala.beans.BeanProperty -import org.apache.spark.annotation.Experimental +import org.apache.spark.annotation.{Experimental, Since} import org.apache.spark.mllib.tree.configuration.Algo._ import org.apache.spark.mllib.tree.loss.{LogLoss, SquaredError, Loss} @@ -38,8 +38,8 @@ import org.apache.spark.mllib.tree.loss.{LogLoss, SquaredError, Loss} * validation input between two iterations is less than the validationTol * then stop. Ignored when * [[org.apache.spark.mllib.tree.GradientBoostedTrees.run()]] is used. 
- * @since 1.2.0 */ +@Since("1.2.0") @Experimental case class BoostingStrategy( // Required boosting parameters @@ -71,9 +71,7 @@ case class BoostingStrategy( } } -/** - * @since 1.2.0 - */ +@Since("1.2.0") @Experimental object BoostingStrategy { @@ -81,8 +79,8 @@ object BoostingStrategy { * Returns default configuration for the boosting algorithm * @param algo Learning goal. Supported: "Classification" or "Regression" * @return Configuration for boosting algorithm - * @since 1.2.0 */ + @Since("1.2.0") def defaultParams(algo: String): BoostingStrategy = { defaultParams(Algo.fromString(algo)) } @@ -93,8 +91,8 @@ object BoostingStrategy { * [[org.apache.spark.mllib.tree.configuration.Algo.Classification]], * [[org.apache.spark.mllib.tree.configuration.Algo.Regression]] * @return Configuration for boosting algorithm - * @since 1.3.0 */ + @Since("1.3.0") def defaultParams(algo: Algo): BoostingStrategy = { val treeStrategy = Strategy.defaultStrategy(algo) treeStrategy.maxDepth = 3 diff --git a/mllib/src/main/scala/org/apache/spark/mllib/tree/configuration/FeatureType.scala b/mllib/src/main/scala/org/apache/spark/mllib/tree/configuration/FeatureType.scala index 0684cafa486bd..bb7c7ee4f964f 100644 --- a/mllib/src/main/scala/org/apache/spark/mllib/tree/configuration/FeatureType.scala +++ b/mllib/src/main/scala/org/apache/spark/mllib/tree/configuration/FeatureType.scala @@ -17,13 +17,13 @@ package org.apache.spark.mllib.tree.configuration -import org.apache.spark.annotation.Experimental +import org.apache.spark.annotation.{Experimental, Since} /** * :: Experimental :: * Enum to describe whether a feature is "continuous" or "categorical" - * @since 1.0.0 */ +@Since("1.0.0") @Experimental object FeatureType extends Enumeration { type FeatureType = Value diff --git a/mllib/src/main/scala/org/apache/spark/mllib/tree/configuration/QuantileStrategy.scala b/mllib/src/main/scala/org/apache/spark/mllib/tree/configuration/QuantileStrategy.scala index 2daa63c4d2771..904e42deebb5f 100644 --- a/mllib/src/main/scala/org/apache/spark/mllib/tree/configuration/QuantileStrategy.scala +++ b/mllib/src/main/scala/org/apache/spark/mllib/tree/configuration/QuantileStrategy.scala @@ -17,13 +17,13 @@ package org.apache.spark.mllib.tree.configuration -import org.apache.spark.annotation.Experimental +import org.apache.spark.annotation.{Experimental, Since} /** * :: Experimental :: * Enum for selecting the quantile calculation strategy - * @since 1.0.0 */ +@Since("1.0.0") @Experimental object QuantileStrategy extends Enumeration { type QuantileStrategy = Value diff --git a/mllib/src/main/scala/org/apache/spark/mllib/tree/configuration/Strategy.scala b/mllib/src/main/scala/org/apache/spark/mllib/tree/configuration/Strategy.scala index 7ae25a88bf500..a58f01ba8544e 100644 --- a/mllib/src/main/scala/org/apache/spark/mllib/tree/configuration/Strategy.scala +++ b/mllib/src/main/scala/org/apache/spark/mllib/tree/configuration/Strategy.scala @@ -20,7 +20,7 @@ package org.apache.spark.mllib.tree.configuration import scala.beans.BeanProperty import scala.collection.JavaConverters._ -import org.apache.spark.annotation.Experimental +import org.apache.spark.annotation.{Experimental, Since} import org.apache.spark.mllib.tree.impurity.{Variance, Entropy, Gini, Impurity} import org.apache.spark.mllib.tree.configuration.Algo._ import org.apache.spark.mllib.tree.configuration.QuantileStrategy._ @@ -66,8 +66,8 @@ import org.apache.spark.mllib.tree.configuration.QuantileStrategy._ * E.g. 
10 means that the cache will get checkpointed every 10 updates. If * the checkpoint directory is not set in * [[org.apache.spark.SparkContext]], this setting is ignored. - * @since 1.0.0 */ +@Since("1.0.0") @Experimental class Strategy ( @BeanProperty var algo: Algo, @@ -85,23 +85,23 @@ class Strategy ( @BeanProperty var checkpointInterval: Int = 10) extends Serializable { /** - * @since 1.2.0 */ + @Since("1.2.0") def isMulticlassClassification: Boolean = { algo == Classification && numClasses > 2 } /** - * @since 1.2.0 */ + @Since("1.2.0") def isMulticlassWithCategoricalFeatures: Boolean = { isMulticlassClassification && (categoricalFeaturesInfo.size > 0) } /** * Java-friendly constructor for [[org.apache.spark.mllib.tree.configuration.Strategy]] - * @since 1.1.0 */ + @Since("1.1.0") def this( algo: Algo, impurity: Impurity, @@ -115,8 +115,8 @@ class Strategy ( /** * Sets Algorithm using a String. - * @since 1.2.0 */ + @Since("1.2.0") def setAlgo(algo: String): Unit = algo match { case "Classification" => setAlgo(Classification) case "Regression" => setAlgo(Regression) @@ -124,8 +124,8 @@ class Strategy ( /** * Sets categoricalFeaturesInfo using a Java Map. - * @since 1.2.0 */ + @Since("1.2.0") def setCategoricalFeaturesInfo( categoricalFeaturesInfo: java.util.Map[java.lang.Integer, java.lang.Integer]): Unit = { this.categoricalFeaturesInfo = @@ -174,8 +174,8 @@ class Strategy ( /** * Returns a shallow copy of this instance. - * @since 1.2.0 */ + @Since("1.2.0") def copy: Strategy = { new Strategy(algo, impurity, maxDepth, numClasses, maxBins, quantileCalculationStrategy, categoricalFeaturesInfo, minInstancesPerNode, minInfoGain, @@ -183,17 +183,15 @@ class Strategy ( } } -/** - * @since 1.2.0 - */ +@Since("1.2.0") @Experimental object Strategy { /** * Construct a default set of parameters for [[org.apache.spark.mllib.tree.DecisionTree]] * @param algo "Classification" or "Regression" - * @since 1.2.0 */ + @Since("1.2.0") def defaultStrategy(algo: String): Strategy = { defaultStrategy(Algo.fromString(algo)) } @@ -201,8 +199,8 @@ object Strategy { /** * Construct a default set of parameters for [[org.apache.spark.mllib.tree.DecisionTree]] * @param algo Algo.Classification or Algo.Regression - * @since 1.3.0 */ + @Since("1.3.0") def defaultStrategy(algo: Algo): Strategy = algo match { case Algo.Classification => new Strategy(algo = Classification, impurity = Gini, maxDepth = 10, diff --git a/mllib/src/main/scala/org/apache/spark/mllib/tree/impurity/Entropy.scala b/mllib/src/main/scala/org/apache/spark/mllib/tree/impurity/Entropy.scala index 0b6c7266dee05..73df6b054a8ce 100644 --- a/mllib/src/main/scala/org/apache/spark/mllib/tree/impurity/Entropy.scala +++ b/mllib/src/main/scala/org/apache/spark/mllib/tree/impurity/Entropy.scala @@ -17,14 +17,14 @@ package org.apache.spark.mllib.tree.impurity -import org.apache.spark.annotation.{DeveloperApi, Experimental} +import org.apache.spark.annotation.{DeveloperApi, Experimental, Since} /** * :: Experimental :: * Class for calculating [[http://en.wikipedia.org/wiki/Binary_entropy_function entropy]] during * binary classification. 
- * @since 1.0.0 */ +@Since("1.0.0") @Experimental object Entropy extends Impurity { @@ -36,8 +36,8 @@ object Entropy extends Impurity { * @param counts Array[Double] with counts for each label * @param totalCount sum of counts for all labels * @return information value, or 0 if totalCount = 0 - * @since 1.1.0 */ + @Since("1.1.0") @DeveloperApi override def calculate(counts: Array[Double], totalCount: Double): Double = { if (totalCount == 0) { @@ -64,8 +64,8 @@ object Entropy extends Impurity { * @param sum sum of labels * @param sumSquares summation of squares of the labels * @return information value, or 0 if count = 0 - * @since 1.0.0 */ + @Since("1.0.0") @DeveloperApi override def calculate(count: Double, sum: Double, sumSquares: Double): Double = throw new UnsupportedOperationException("Entropy.calculate") @@ -73,8 +73,8 @@ object Entropy extends Impurity { /** * Get this impurity instance. * This is useful for passing impurity parameters to a Strategy in Java. - * @since 1.1.0 */ + @Since("1.1.0") def instance: this.type = this } diff --git a/mllib/src/main/scala/org/apache/spark/mllib/tree/impurity/Gini.scala b/mllib/src/main/scala/org/apache/spark/mllib/tree/impurity/Gini.scala index 3b0be428833ae..f21845b21a802 100644 --- a/mllib/src/main/scala/org/apache/spark/mllib/tree/impurity/Gini.scala +++ b/mllib/src/main/scala/org/apache/spark/mllib/tree/impurity/Gini.scala @@ -17,15 +17,15 @@ package org.apache.spark.mllib.tree.impurity -import org.apache.spark.annotation.{DeveloperApi, Experimental} +import org.apache.spark.annotation.{DeveloperApi, Experimental, Since} /** * :: Experimental :: * Class for calculating the * [[http://en.wikipedia.org/wiki/Decision_tree_learning#Gini_impurity Gini impurity]] * during binary classification. - * @since 1.0.0 */ +@Since("1.0.0") @Experimental object Gini extends Impurity { @@ -35,8 +35,8 @@ object Gini extends Impurity { * @param counts Array[Double] with counts for each label * @param totalCount sum of counts for all labels * @return information value, or 0 if totalCount = 0 - * @since 1.1.0 */ + @Since("1.1.0") @DeveloperApi override def calculate(counts: Array[Double], totalCount: Double): Double = { if (totalCount == 0) { @@ -60,8 +60,8 @@ object Gini extends Impurity { * @param sum sum of labels * @param sumSquares summation of squares of the labels * @return information value, or 0 if count = 0 - * @since 1.0.0 */ + @Since("1.0.0") @DeveloperApi override def calculate(count: Double, sum: Double, sumSquares: Double): Double = throw new UnsupportedOperationException("Gini.calculate") @@ -69,8 +69,8 @@ object Gini extends Impurity { /** * Get this impurity instance. * This is useful for passing impurity parameters to a Strategy in Java. 
- * @since 1.1.0 */ + @Since("1.1.0") def instance: this.type = this } diff --git a/mllib/src/main/scala/org/apache/spark/mllib/tree/impurity/Impurity.scala b/mllib/src/main/scala/org/apache/spark/mllib/tree/impurity/Impurity.scala index dd297400058d2..4637dcceea7f8 100644 --- a/mllib/src/main/scala/org/apache/spark/mllib/tree/impurity/Impurity.scala +++ b/mllib/src/main/scala/org/apache/spark/mllib/tree/impurity/Impurity.scala @@ -17,7 +17,7 @@ package org.apache.spark.mllib.tree.impurity -import org.apache.spark.annotation.{DeveloperApi, Experimental} +import org.apache.spark.annotation.{DeveloperApi, Experimental, Since} /** * :: Experimental :: @@ -25,8 +25,8 @@ import org.apache.spark.annotation.{DeveloperApi, Experimental} * This trait is used for * (a) setting the impurity parameter in [[org.apache.spark.mllib.tree.configuration.Strategy]] * (b) calculating impurity values from sufficient statistics. - * @since 1.0.0 */ +@Since("1.0.0") @Experimental trait Impurity extends Serializable { @@ -36,8 +36,8 @@ trait Impurity extends Serializable { * @param counts Array[Double] with counts for each label * @param totalCount sum of counts for all labels * @return information value, or 0 if totalCount = 0 - * @since 1.1.0 */ + @Since("1.1.0") @DeveloperApi def calculate(counts: Array[Double], totalCount: Double): Double @@ -48,8 +48,8 @@ trait Impurity extends Serializable { * @param sum sum of labels * @param sumSquares summation of squares of the labels * @return information value, or 0 if count = 0 - * @since 1.0.0 */ + @Since("1.0.0") @DeveloperApi def calculate(count: Double, sum: Double, sumSquares: Double): Double } diff --git a/mllib/src/main/scala/org/apache/spark/mllib/tree/impurity/Variance.scala b/mllib/src/main/scala/org/apache/spark/mllib/tree/impurity/Variance.scala index adbe05811f286..a74197278d6f7 100644 --- a/mllib/src/main/scala/org/apache/spark/mllib/tree/impurity/Variance.scala +++ b/mllib/src/main/scala/org/apache/spark/mllib/tree/impurity/Variance.scala @@ -17,13 +17,13 @@ package org.apache.spark.mllib.tree.impurity -import org.apache.spark.annotation.{DeveloperApi, Experimental} +import org.apache.spark.annotation.{DeveloperApi, Experimental, Since} /** * :: Experimental :: * Class for calculating variance during regression - * @since 1.0.0 */ +@Since("1.0.0") @Experimental object Variance extends Impurity { @@ -33,8 +33,8 @@ object Variance extends Impurity { * @param counts Array[Double] with counts for each label * @param totalCount sum of counts for all labels * @return information value, or 0 if totalCount = 0 - * @since 1.1.0 */ + @Since("1.1.0") @DeveloperApi override def calculate(counts: Array[Double], totalCount: Double): Double = throw new UnsupportedOperationException("Variance.calculate") @@ -46,8 +46,8 @@ object Variance extends Impurity { * @param sum sum of labels * @param sumSquares summation of squares of the labels * @return information value, or 0 if count = 0 - * @since 1.0.0 */ + @Since("1.0.0") @DeveloperApi override def calculate(count: Double, sum: Double, sumSquares: Double): Double = { if (count == 0) { @@ -60,8 +60,8 @@ object Variance extends Impurity { /** * Get this impurity instance. * This is useful for passing impurity parameters to a Strategy in Java. 
- * @since 1.0.0 */ + @Since("1.0.0") def instance: this.type = this } diff --git a/mllib/src/main/scala/org/apache/spark/mllib/tree/loss/AbsoluteError.scala b/mllib/src/main/scala/org/apache/spark/mllib/tree/loss/AbsoluteError.scala index c6e3d0d824dd7..bab7b8c6cadf2 100644 --- a/mllib/src/main/scala/org/apache/spark/mllib/tree/loss/AbsoluteError.scala +++ b/mllib/src/main/scala/org/apache/spark/mllib/tree/loss/AbsoluteError.scala @@ -17,7 +17,7 @@ package org.apache.spark.mllib.tree.loss -import org.apache.spark.annotation.DeveloperApi +import org.apache.spark.annotation.{DeveloperApi, Since} import org.apache.spark.mllib.regression.LabeledPoint import org.apache.spark.mllib.tree.model.TreeEnsembleModel @@ -29,8 +29,8 @@ import org.apache.spark.mllib.tree.model.TreeEnsembleModel * The absolute (L1) error is defined as: * |y - F(x)| * where y is the label and F(x) is the model prediction for features x. - * @since 1.2.0 */ +@Since("1.2.0") @DeveloperApi object AbsoluteError extends Loss { @@ -41,8 +41,8 @@ object AbsoluteError extends Loss { * @param prediction Predicted label. * @param label True label. * @return Loss gradient - * @since 1.2.0 */ + @Since("1.2.0") override def gradient(prediction: Double, label: Double): Double = { if (label - prediction < 0) 1.0 else -1.0 } diff --git a/mllib/src/main/scala/org/apache/spark/mllib/tree/loss/LogLoss.scala b/mllib/src/main/scala/org/apache/spark/mllib/tree/loss/LogLoss.scala index eee58445a1ec1..b2b4594712f0d 100644 --- a/mllib/src/main/scala/org/apache/spark/mllib/tree/loss/LogLoss.scala +++ b/mllib/src/main/scala/org/apache/spark/mllib/tree/loss/LogLoss.scala @@ -17,7 +17,7 @@ package org.apache.spark.mllib.tree.loss -import org.apache.spark.annotation.DeveloperApi +import org.apache.spark.annotation.{DeveloperApi, Since} import org.apache.spark.mllib.regression.LabeledPoint import org.apache.spark.mllib.tree.model.TreeEnsembleModel import org.apache.spark.mllib.util.MLUtils @@ -31,8 +31,8 @@ import org.apache.spark.mllib.util.MLUtils * The log loss is defined as: * 2 log(1 + exp(-2 y F(x))) * where y is a label in {-1, 1} and F(x) is the model prediction for features x. - * @since 1.2.0 */ +@Since("1.2.0") @DeveloperApi object LogLoss extends Loss { @@ -43,8 +43,8 @@ object LogLoss extends Loss { * @param prediction Predicted label. * @param label True label. * @return Loss gradient - * @since 1.2.0 */ + @Since("1.2.0") override def gradient(prediction: Double, label: Double): Double = { - 4.0 * label / (1.0 + math.exp(2.0 * label * prediction)) } diff --git a/mllib/src/main/scala/org/apache/spark/mllib/tree/loss/Loss.scala b/mllib/src/main/scala/org/apache/spark/mllib/tree/loss/Loss.scala index 7c9fb924645c8..687cde325ffed 100644 --- a/mllib/src/main/scala/org/apache/spark/mllib/tree/loss/Loss.scala +++ b/mllib/src/main/scala/org/apache/spark/mllib/tree/loss/Loss.scala @@ -17,7 +17,7 @@ package org.apache.spark.mllib.tree.loss -import org.apache.spark.annotation.DeveloperApi +import org.apache.spark.annotation.{DeveloperApi, Since} import org.apache.spark.mllib.regression.LabeledPoint import org.apache.spark.mllib.tree.model.TreeEnsembleModel import org.apache.spark.rdd.RDD @@ -26,8 +26,8 @@ import org.apache.spark.rdd.RDD /** * :: DeveloperApi :: * Trait for adding "pluggable" loss functions for the gradient boosting algorithm. - * @since 1.2.0 */ +@Since("1.2.0") @DeveloperApi trait Loss extends Serializable { @@ -36,8 +36,8 @@ trait Loss extends Serializable { * @param prediction Predicted feature * @param label true label. 
* @return Loss gradient. - * @since 1.2.0 */ + @Since("1.2.0") def gradient(prediction: Double, label: Double): Double /** @@ -47,8 +47,8 @@ trait Loss extends Serializable { * @param model Model of the weak learner. * @param data Training dataset: RDD of [[org.apache.spark.mllib.regression.LabeledPoint]]. * @return Measure of model error on data - * @since 1.2.0 */ + @Since("1.2.0") def computeError(model: TreeEnsembleModel, data: RDD[LabeledPoint]): Double = { data.map(point => computeError(model.predict(point.features), point.label)).mean() } diff --git a/mllib/src/main/scala/org/apache/spark/mllib/tree/loss/Losses.scala b/mllib/src/main/scala/org/apache/spark/mllib/tree/loss/Losses.scala index 47dc94cde7e03..2b112fbe12202 100644 --- a/mllib/src/main/scala/org/apache/spark/mllib/tree/loss/Losses.scala +++ b/mllib/src/main/scala/org/apache/spark/mllib/tree/loss/Losses.scala @@ -17,14 +17,12 @@ package org.apache.spark.mllib.tree.loss -/** - * @since 1.2.0 - */ +import org.apache.spark.annotation.Since + +@Since("1.2.0") object Losses { - /** - * @since 1.2.0 - */ + @Since("1.2.0") def fromString(name: String): Loss = name match { case "leastSquaresError" => SquaredError case "leastAbsoluteError" => AbsoluteError diff --git a/mllib/src/main/scala/org/apache/spark/mllib/tree/loss/SquaredError.scala b/mllib/src/main/scala/org/apache/spark/mllib/tree/loss/SquaredError.scala index ff8903d6956bd..3f7d3d38be16c 100644 --- a/mllib/src/main/scala/org/apache/spark/mllib/tree/loss/SquaredError.scala +++ b/mllib/src/main/scala/org/apache/spark/mllib/tree/loss/SquaredError.scala @@ -17,7 +17,7 @@ package org.apache.spark.mllib.tree.loss -import org.apache.spark.annotation.DeveloperApi +import org.apache.spark.annotation.{DeveloperApi, Since} import org.apache.spark.mllib.regression.LabeledPoint import org.apache.spark.mllib.tree.model.TreeEnsembleModel @@ -29,8 +29,8 @@ import org.apache.spark.mllib.tree.model.TreeEnsembleModel * The squared (L2) error is defined as: * (y - F(x))**2 * where y is the label and F(x) is the model prediction for features x. - * @since 1.2.0 */ +@Since("1.2.0") @DeveloperApi object SquaredError extends Loss { @@ -41,8 +41,8 @@ object SquaredError extends Loss { * @param prediction Predicted label. * @param label True label. * @return Loss gradient - * @since 1.2.0 */ + @Since("1.2.0") override def gradient(prediction: Double, label: Double): Double = { - 2.0 * (label - prediction) } diff --git a/mllib/src/main/scala/org/apache/spark/mllib/tree/model/DecisionTreeModel.scala b/mllib/src/main/scala/org/apache/spark/mllib/tree/model/DecisionTreeModel.scala index 0f386a26601ce..3eefd135f7836 100644 --- a/mllib/src/main/scala/org/apache/spark/mllib/tree/model/DecisionTreeModel.scala +++ b/mllib/src/main/scala/org/apache/spark/mllib/tree/model/DecisionTreeModel.scala @@ -24,7 +24,7 @@ import org.json4s.JsonDSL._ import org.json4s.jackson.JsonMethods._ import org.apache.spark.{Logging, SparkContext} -import org.apache.spark.annotation.Experimental +import org.apache.spark.annotation.{Experimental, Since} import org.apache.spark.api.java.JavaRDD import org.apache.spark.mllib.linalg.Vector import org.apache.spark.mllib.tree.configuration.{Algo, FeatureType} @@ -40,8 +40,8 @@ import org.apache.spark.util.Utils * This model stores the decision tree structure and parameters. 
* @param topNode root node * @param algo algorithm type -- classification or regression - * @since 1.0.0 */ +@Since("1.0.0") @Experimental class DecisionTreeModel(val topNode: Node, val algo: Algo) extends Serializable with Saveable { @@ -50,8 +50,8 @@ class DecisionTreeModel(val topNode: Node, val algo: Algo) extends Serializable * * @param features array representing a single data point * @return Double prediction from the trained model - * @since 1.0.0 */ + @Since("1.0.0") def predict(features: Vector): Double = { topNode.predict(features) } @@ -61,8 +61,8 @@ class DecisionTreeModel(val topNode: Node, val algo: Algo) extends Serializable * * @param features RDD representing data points to be predicted * @return RDD of predictions for each of the given data points - * @since 1.0.0 */ + @Since("1.0.0") def predict(features: RDD[Vector]): RDD[Double] = { features.map(x => predict(x)) } @@ -72,16 +72,16 @@ class DecisionTreeModel(val topNode: Node, val algo: Algo) extends Serializable * * @param features JavaRDD representing data points to be predicted * @return JavaRDD of predictions for each of the given data points - * @since 1.2.0 */ + @Since("1.2.0") def predict(features: JavaRDD[Vector]): JavaRDD[Double] = { predict(features.rdd) } /** * Get number of nodes in tree, including leaf nodes. - * @since 1.1.0 */ + @Since("1.1.0") def numNodes: Int = { 1 + topNode.numDescendants } @@ -89,8 +89,8 @@ class DecisionTreeModel(val topNode: Node, val algo: Algo) extends Serializable /** * Get depth of tree. * E.g.: Depth 0 means 1 leaf node. Depth 1 means 1 internal node and 2 leaf nodes. - * @since 1.1.0 */ + @Since("1.1.0") def depth: Int = { topNode.subtreeDepth } @@ -119,8 +119,8 @@ class DecisionTreeModel(val topNode: Node, val algo: Algo) extends Serializable * @param sc Spark context used to save model data. * @param path Path specifying the directory in which to save this model. * If the directory already exists, this method throws an exception. - * @since 1.3.0 */ + @Since("1.3.0") override def save(sc: SparkContext, path: String): Unit = { DecisionTreeModel.SaveLoadV1_0.save(sc, path, this) } @@ -128,9 +128,7 @@ class DecisionTreeModel(val topNode: Node, val algo: Algo) extends Serializable override protected def formatVersion: String = DecisionTreeModel.formatVersion } -/** - * @since 1.3.0 - */ +@Since("1.3.0") object DecisionTreeModel extends Loader[DecisionTreeModel] with Logging { private[spark] def formatVersion: String = "1.0" @@ -317,8 +315,8 @@ object DecisionTreeModel extends Loader[DecisionTreeModel] with Logging { * @param sc Spark context used for loading model files. * @param path Path specifying the directory to which the model was saved. 
* @return Model instance - * @since 1.3.0 */ + @Since("1.3.0") override def load(sc: SparkContext, path: String): DecisionTreeModel = { implicit val formats = DefaultFormats val (loadedClassName, version, metadata) = Loader.loadMetadata(sc, path) diff --git a/mllib/src/main/scala/org/apache/spark/mllib/tree/model/InformationGainStats.scala b/mllib/src/main/scala/org/apache/spark/mllib/tree/model/InformationGainStats.scala index 23f0363639120..091a0462c204f 100644 --- a/mllib/src/main/scala/org/apache/spark/mllib/tree/model/InformationGainStats.scala +++ b/mllib/src/main/scala/org/apache/spark/mllib/tree/model/InformationGainStats.scala @@ -17,7 +17,7 @@ package org.apache.spark.mllib.tree.model -import org.apache.spark.annotation.DeveloperApi +import org.apache.spark.annotation.{DeveloperApi, Since} import org.apache.spark.mllib.tree.impurity.ImpurityCalculator /** @@ -29,8 +29,8 @@ import org.apache.spark.mllib.tree.impurity.ImpurityCalculator * @param rightImpurity right node impurity * @param leftPredict left node predict * @param rightPredict right node predict - * @since 1.0.0 */ +@Since("1.0.0") @DeveloperApi class InformationGainStats( val gain: Double, diff --git a/mllib/src/main/scala/org/apache/spark/mllib/tree/model/Node.scala b/mllib/src/main/scala/org/apache/spark/mllib/tree/model/Node.scala index aca3350c2e535..8c54c55107233 100644 --- a/mllib/src/main/scala/org/apache/spark/mllib/tree/model/Node.scala +++ b/mllib/src/main/scala/org/apache/spark/mllib/tree/model/Node.scala @@ -17,7 +17,7 @@ package org.apache.spark.mllib.tree.model -import org.apache.spark.annotation.DeveloperApi +import org.apache.spark.annotation.{DeveloperApi, Since} import org.apache.spark.Logging import org.apache.spark.mllib.tree.configuration.FeatureType._ import org.apache.spark.mllib.linalg.Vector @@ -38,8 +38,8 @@ import org.apache.spark.mllib.linalg.Vector * @param leftNode left child * @param rightNode right child * @param stats information gain stats - * @since 1.0.0 */ +@Since("1.0.0") @DeveloperApi class Node ( val id: Int, @@ -59,8 +59,8 @@ class Node ( /** * build the left node and right nodes if not leaf * @param nodes array of nodes - * @since 1.0.0 */ + @Since("1.0.0") @deprecated("build should no longer be used since trees are constructed on-the-fly in training", "1.2.0") def build(nodes: Array[Node]): Unit = { @@ -81,8 +81,8 @@ class Node ( * predict value if node is not leaf * @param features feature value * @return predicted value - * @since 1.1.0 */ + @Since("1.1.0") def predict(features: Vector) : Double = { if (isLeaf) { predict.predict diff --git a/mllib/src/main/scala/org/apache/spark/mllib/tree/model/Predict.scala b/mllib/src/main/scala/org/apache/spark/mllib/tree/model/Predict.scala index be819b59e7048..965784051ede5 100644 --- a/mllib/src/main/scala/org/apache/spark/mllib/tree/model/Predict.scala +++ b/mllib/src/main/scala/org/apache/spark/mllib/tree/model/Predict.scala @@ -17,14 +17,14 @@ package org.apache.spark.mllib.tree.model -import org.apache.spark.annotation.DeveloperApi +import org.apache.spark.annotation.{DeveloperApi, Since} /** * Predicted value for a node * @param predict predicted value * @param prob probability of the label (classification only) - * @since 1.2.0 */ +@Since("1.2.0") @DeveloperApi class Predict( val predict: Double, diff --git a/mllib/src/main/scala/org/apache/spark/mllib/tree/model/Split.scala b/mllib/src/main/scala/org/apache/spark/mllib/tree/model/Split.scala index 18d40530aee1e..45db83ae3a1f3 100644 --- 
a/mllib/src/main/scala/org/apache/spark/mllib/tree/model/Split.scala +++ b/mllib/src/main/scala/org/apache/spark/mllib/tree/model/Split.scala @@ -17,7 +17,7 @@ package org.apache.spark.mllib.tree.model -import org.apache.spark.annotation.DeveloperApi +import org.apache.spark.annotation.{DeveloperApi, Since} import org.apache.spark.mllib.tree.configuration.FeatureType.FeatureType import org.apache.spark.mllib.tree.configuration.FeatureType import org.apache.spark.mllib.tree.configuration.FeatureType.FeatureType @@ -30,8 +30,8 @@ import org.apache.spark.mllib.tree.configuration.FeatureType.FeatureType * Split left if feature <= threshold, else right. * @param featureType type of feature -- categorical or continuous * @param categories Split left if categorical feature value is in this set, else right. - * @since 1.0.0 */ +@Since("1.0.0") @DeveloperApi case class Split( feature: Int, diff --git a/mllib/src/main/scala/org/apache/spark/mllib/tree/model/treeEnsembleModels.scala b/mllib/src/main/scala/org/apache/spark/mllib/tree/model/treeEnsembleModels.scala index 0c629b12a84df..19571447a2c56 100644 --- a/mllib/src/main/scala/org/apache/spark/mllib/tree/model/treeEnsembleModels.scala +++ b/mllib/src/main/scala/org/apache/spark/mllib/tree/model/treeEnsembleModels.scala @@ -25,7 +25,7 @@ import org.json4s.JsonDSL._ import org.json4s.jackson.JsonMethods._ import org.apache.spark.{Logging, SparkContext} -import org.apache.spark.annotation.Experimental +import org.apache.spark.annotation.{Experimental, Since} import org.apache.spark.api.java.JavaRDD import org.apache.spark.mllib.linalg.Vector import org.apache.spark.mllib.regression.LabeledPoint @@ -45,8 +45,8 @@ import org.apache.spark.util.Utils * * @param algo algorithm for the ensemble model, either Classification or Regression * @param trees tree ensembles - * @since 1.2.0 */ +@Since("1.2.0") @Experimental class RandomForestModel(override val algo: Algo, override val trees: Array[DecisionTreeModel]) extends TreeEnsembleModel(algo, trees, Array.fill(trees.length)(1.0), @@ -60,8 +60,8 @@ class RandomForestModel(override val algo: Algo, override val trees: Array[Decis * @param sc Spark context used to save model data. * @param path Path specifying the directory in which to save this model. * If the directory already exists, this method throws an exception. - * @since 1.3.0 */ + @Since("1.3.0") override def save(sc: SparkContext, path: String): Unit = { TreeEnsembleModel.SaveLoadV1_0.save(sc, path, this, RandomForestModel.SaveLoadV1_0.thisClassName) @@ -70,9 +70,7 @@ class RandomForestModel(override val algo: Algo, override val trees: Array[Decis override protected def formatVersion: String = RandomForestModel.formatVersion } -/** - * @since 1.3.0 - */ +@Since("1.3.0") object RandomForestModel extends Loader[RandomForestModel] { private[mllib] def formatVersion: String = TreeEnsembleModel.SaveLoadV1_0.thisFormatVersion @@ -82,8 +80,8 @@ object RandomForestModel extends Loader[RandomForestModel] { * @param sc Spark context used for loading model files. * @param path Path specifying the directory to which the model was saved. 
* @return Model instance - * @since 1.3.0 */ + @Since("1.3.0") override def load(sc: SparkContext, path: String): RandomForestModel = { val (loadedClassName, version, jsonMetadata) = Loader.loadMetadata(sc, path) val classNameV1_0 = SaveLoadV1_0.thisClassName @@ -114,8 +112,8 @@ object RandomForestModel extends Loader[RandomForestModel] { * @param algo algorithm for the ensemble model, either Classification or Regression * @param trees tree ensembles * @param treeWeights tree ensemble weights - * @since 1.2.0 */ +@Since("1.2.0") @Experimental class GradientBoostedTreesModel( override val algo: Algo, @@ -130,8 +128,8 @@ class GradientBoostedTreesModel( * @param sc Spark context used to save model data. * @param path Path specifying the directory in which to save this model. * If the directory already exists, this method throws an exception. - * @since 1.3.0 */ + @Since("1.3.0") override def save(sc: SparkContext, path: String): Unit = { TreeEnsembleModel.SaveLoadV1_0.save(sc, path, this, GradientBoostedTreesModel.SaveLoadV1_0.thisClassName) @@ -143,8 +141,8 @@ class GradientBoostedTreesModel( * @param loss evaluation metric. * @return an array with index i having the losses or errors for the ensemble * containing the first i+1 trees - * @since 1.4.0 */ + @Since("1.4.0") def evaluateEachIteration( data: RDD[LabeledPoint], loss: Loss): Array[Double] = { @@ -186,8 +184,8 @@ class GradientBoostedTreesModel( } /** - * @since 1.3.0 */ +@Since("1.3.0") object GradientBoostedTreesModel extends Loader[GradientBoostedTreesModel] { /** @@ -199,8 +197,8 @@ object GradientBoostedTreesModel extends Loader[GradientBoostedTreesModel] { * @param loss: evaluation metric. * @return a RDD with each element being a zip of the prediction and error * corresponding to every sample. - * @since 1.4.0 */ + @Since("1.4.0") def computeInitialPredictionAndError( data: RDD[LabeledPoint], initTreeWeight: Double, @@ -223,8 +221,8 @@ object GradientBoostedTreesModel extends Loader[GradientBoostedTreesModel] { * @param loss: evaluation metric. * @return a RDD with each element being a zip of the prediction and error * corresponding to each sample. - * @since 1.4.0 */ + @Since("1.4.0") def updatePredictionError( data: RDD[LabeledPoint], predictionAndError: RDD[(Double, Double)], @@ -248,8 +246,8 @@ object GradientBoostedTreesModel extends Loader[GradientBoostedTreesModel] { * @param sc Spark context used for loading model files. * @param path Path specifying the directory to which the model was saved. * @return Model instance - * @since 1.3.0 */ + @Since("1.3.0") override def load(sc: SparkContext, path: String): GradientBoostedTreesModel = { val (loadedClassName, version, jsonMetadata) = Loader.loadMetadata(sc, path) val classNameV1_0 = SaveLoadV1_0.thisClassName diff --git a/mllib/src/main/scala/org/apache/spark/mllib/tree/package.scala b/mllib/src/main/scala/org/apache/spark/mllib/tree/package.scala index f520b3a1b7c72..bcaacc1b1f191 100644 --- a/mllib/src/main/scala/org/apache/spark/mllib/tree/package.scala +++ b/mllib/src/main/scala/org/apache/spark/mllib/tree/package.scala @@ -24,7 +24,6 @@ package org.apache.spark.mllib * - information loss calculation with entropy and Gini for classification and * variance for regression, * - both continuous and categorical features. 
- * @since 1.0.0 */ package object tree { } diff --git a/mllib/src/main/scala/org/apache/spark/mllib/util/MLUtils.scala b/mllib/src/main/scala/org/apache/spark/mllib/util/MLUtils.scala index 11ed23176fc12..4940974bf4f41 100644 --- a/mllib/src/main/scala/org/apache/spark/mllib/util/MLUtils.scala +++ b/mllib/src/main/scala/org/apache/spark/mllib/util/MLUtils.scala @@ -21,7 +21,7 @@ import scala.reflect.ClassTag import breeze.linalg.{DenseVector => BDV, SparseVector => BSV} -import org.apache.spark.annotation.Experimental +import org.apache.spark.annotation.{Experimental, Since} import org.apache.spark.SparkContext import org.apache.spark.rdd.RDD import org.apache.spark.rdd.PartitionwiseSampledRDD @@ -64,8 +64,8 @@ object MLUtils { * feature dimensions. * @param minPartitions min number of partitions * @return labeled data stored as an RDD[LabeledPoint] - * @since 1.0.0 */ + @Since("1.0.0") def loadLibSVMFile( sc: SparkContext, path: String, @@ -115,9 +115,7 @@ object MLUtils { // Convenient methods for `loadLibSVMFile`. - /** - * @since 1.0.0 - */ + @Since("1.0.0") @deprecated("use method without multiclass argument, which no longer has effect", "1.1.0") def loadLibSVMFile( sc: SparkContext, @@ -130,17 +128,15 @@ object MLUtils { /** * Loads labeled data in the LIBSVM format into an RDD[LabeledPoint], with the default number of * partitions. - * @since 1.0.0 */ + @Since("1.0.0") def loadLibSVMFile( sc: SparkContext, path: String, numFeatures: Int): RDD[LabeledPoint] = loadLibSVMFile(sc, path, numFeatures, sc.defaultMinPartitions) - /** - * @since 1.0.0 - */ + @Since("1.0.0") @deprecated("use method without multiclass argument, which no longer has effect", "1.1.0") def loadLibSVMFile( sc: SparkContext, @@ -149,9 +145,7 @@ object MLUtils { numFeatures: Int): RDD[LabeledPoint] = loadLibSVMFile(sc, path, numFeatures) - /** - * @since 1.0.0 - */ + @Since("1.0.0") @deprecated("use method without multiclass argument, which no longer has effect", "1.1.0") def loadLibSVMFile( sc: SparkContext, @@ -162,8 +156,8 @@ object MLUtils { /** * Loads binary labeled data in the LIBSVM format into an RDD[LabeledPoint], with number of * features determined automatically and the default number of partitions. - * @since 1.0.0 */ + @Since("1.0.0") def loadLibSVMFile(sc: SparkContext, path: String): RDD[LabeledPoint] = loadLibSVMFile(sc, path, -1) @@ -193,15 +187,15 @@ object MLUtils { * @param path file or directory path in any Hadoop-supported file system URI * @param minPartitions min number of partitions * @return vectors stored as an RDD[Vector] - * @since 1.1.0 */ + @Since("1.1.0") def loadVectors(sc: SparkContext, path: String, minPartitions: Int): RDD[Vector] = sc.textFile(path, minPartitions).map(Vectors.parse) /** * Loads vectors saved using `RDD[Vector].saveAsTextFile` with the default number of partitions. - * @since 1.1.0 */ + @Since("1.1.0") def loadVectors(sc: SparkContext, path: String): RDD[Vector] = sc.textFile(path, sc.defaultMinPartitions).map(Vectors.parse) @@ -211,16 +205,16 @@ object MLUtils { * @param path file or directory path in any Hadoop-supported file system URI * @param minPartitions min number of partitions * @return labeled points stored as an RDD[LabeledPoint] - * @since 1.1.0 */ + @Since("1.1.0") def loadLabeledPoints(sc: SparkContext, path: String, minPartitions: Int): RDD[LabeledPoint] = sc.textFile(path, minPartitions).map(LabeledPoint.parse) /** * Loads labeled points saved using `RDD[LabeledPoint].saveAsTextFile` with the default number of * partitions. 
- * @since 1.1.0 */ + @Since("1.1.0") def loadLabeledPoints(sc: SparkContext, dir: String): RDD[LabeledPoint] = loadLabeledPoints(sc, dir, sc.defaultMinPartitions) @@ -236,8 +230,8 @@ object MLUtils { * * @deprecated Should use [[org.apache.spark.rdd.RDD#saveAsTextFile]] for saving and * [[org.apache.spark.mllib.util.MLUtils#loadLabeledPoints]] for loading. - * @since 1.0.0 */ + @Since("1.0.0") @deprecated("Should use MLUtils.loadLabeledPoints instead.", "1.0.1") def loadLabeledData(sc: SparkContext, dir: String): RDD[LabeledPoint] = { sc.textFile(dir).map { line => @@ -258,8 +252,8 @@ object MLUtils { * * @deprecated Should use [[org.apache.spark.rdd.RDD#saveAsTextFile]] for saving and * [[org.apache.spark.mllib.util.MLUtils#loadLabeledPoints]] for loading. - * @since 1.0.0 */ + @Since("1.0.0") @deprecated("Should use RDD[LabeledPoint].saveAsTextFile instead.", "1.0.1") def saveLabeledData(data: RDD[LabeledPoint], dir: String) { val dataStr = data.map(x => x.label + "," + x.features.toArray.mkString(" ")) @@ -271,8 +265,8 @@ object MLUtils { * Return a k element array of pairs of RDDs with the first element of each pair * containing the training data, a complement of the validation data and the second * element, the validation data, containing a unique 1/kth of the data. Where k=numFolds. - * @since 1.0.0 */ + @Since("1.0.0") @Experimental def kFold[T: ClassTag](rdd: RDD[T], numFolds: Int, seed: Int): Array[(RDD[T], RDD[T])] = { val numFoldsF = numFolds.toFloat @@ -287,8 +281,8 @@ object MLUtils { /** * Returns a new vector with `1.0` (bias) appended to the input vector. - * @since 1.0.0 */ + @Since("1.0.0") def appendBias(vector: Vector): Vector = { vector match { case dv: DenseVector => From e3355090d4030daffed5efb0959bf1d724c13c13 Mon Sep 17 00:00:00 2001 From: Yin Huai Date: Fri, 21 Aug 2015 14:30:00 -0700 Subject: [PATCH 036/802] [SPARK-10143] [SQL] Use parquet's block size (row group size) setting as the min split size if necessary. https://issues.apache.org/jira/browse/SPARK-10143 With this PR, we will set min split size to parquet's block size (row group size) set in the conf if the min split size is smaller. So, we can avoid have too many tasks and even useless tasks for reading parquet data. I tested it locally. The table I have has 343MB and it is in my local FS. Because I did not set any min/max split size, the default split size was 32MB and the map stage had 11 tasks. But there were only three tasks that actually read data. With my PR, there were only three tasks in the map stage. Here is the difference. Without this PR: ![image](https://cloud.githubusercontent.com/assets/2072857/9399179/8587dba6-4765-11e5-9189-7ebba52a2b6d.png) With this PR: ![image](https://cloud.githubusercontent.com/assets/2072857/9399185/a4735d74-4765-11e5-8848-1f1e361a6b4b.png) Even if the block size setting does match the actual block size of parquet file, I think it is still generally good to use parquet's block size setting if min split size is smaller than this block size. Tested it on a cluster using ``` val count = sqlContext.table("""store_sales""").groupBy().count().queryExecution.executedPlan(3).execute().count ``` Basically, it reads 0 column of table `store_sales`. My table has 1824 parquet files with size from 80MB to 280MB (1 to 3 row group sizes). Without this patch, in a 16 worker cluster, the job had 5023 tasks and spent 102s. With this patch, the job had 2893 tasks and spent 64s. 
It is still not as good as using one mapper per file (1824 tasks and 42s), but it is much better than our master. Author: Yin Huai Closes #8346 from yhuai/parquetMinSplit. --- .../datasources/parquet/ParquetRelation.scala | 41 ++++++++++++++++++- 1 file changed, 39 insertions(+), 2 deletions(-) diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/parquet/ParquetRelation.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/parquet/ParquetRelation.scala index 68169d48ac57c..bbf682aec0f9d 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/parquet/ParquetRelation.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/parquet/ParquetRelation.scala @@ -26,6 +26,7 @@ import scala.collection.mutable import scala.util.{Failure, Try} import com.google.common.base.Objects +import org.apache.hadoop.conf.Configuration import org.apache.hadoop.fs.{FileStatus, Path} import org.apache.hadoop.io.Writable import org.apache.hadoop.mapreduce._ @@ -281,12 +282,18 @@ private[sql] class ParquetRelation( val assumeInt96IsTimestamp = sqlContext.conf.isParquetINT96AsTimestamp val followParquetFormatSpec = sqlContext.conf.followParquetFormatSpec + // Parquet row group size. We will use this value as the value for + // mapreduce.input.fileinputformat.split.minsize and mapred.min.split.size if the value + // of these flags are smaller than the parquet row group size. + val parquetBlockSize = ParquetOutputFormat.getLongBlockSize(broadcastedConf.value.value) + // Create the function to set variable Parquet confs at both driver and executor side. val initLocalJobFuncOpt = ParquetRelation.initializeLocalJobFunc( requiredColumns, filters, dataSchema, + parquetBlockSize, useMetadataCache, parquetFilterPushDown, assumeBinaryIsString, @@ -294,7 +301,8 @@ private[sql] class ParquetRelation( followParquetFormatSpec) _ // Create the function to set input paths at the driver side. - val setInputPaths = ParquetRelation.initializeDriverSideJobFunc(inputFiles) _ + val setInputPaths = + ParquetRelation.initializeDriverSideJobFunc(inputFiles, parquetBlockSize) _ Utils.withDummyCallSite(sqlContext.sparkContext) { new SqlNewHadoopRDD( @@ -482,11 +490,35 @@ private[sql] object ParquetRelation extends Logging { // internally. private[sql] val METASTORE_SCHEMA = "metastoreSchema" + /** + * If parquet's block size (row group size) setting is larger than the min split size, + * we use parquet's block size setting as the min split size. Otherwise, we will create + * tasks processing nothing (because a split does not cover the starting point of a + * parquet block). See https://issues.apache.org/jira/browse/SPARK-10143 for more information. + */ + private def overrideMinSplitSize(parquetBlockSize: Long, conf: Configuration): Unit = { + val minSplitSize = + math.max( + conf.getLong("mapred.min.split.size", 0L), + conf.getLong("mapreduce.input.fileinputformat.split.minsize", 0L)) + if (parquetBlockSize > minSplitSize) { + val message = + s"Parquet's block size (row group size) is larger than " + + s"mapred.min.split.size/mapreduce.input.fileinputformat.split.minsize. Setting " + + s"mapred.min.split.size and mapreduce.input.fileinputformat.split.minsize to " + + s"$parquetBlockSize." 
+ logDebug(message) + conf.set("mapred.min.split.size", parquetBlockSize.toString) + conf.set("mapreduce.input.fileinputformat.split.minsize", parquetBlockSize.toString) + } + } + /** This closure sets various Parquet configurations at both driver side and executor side. */ private[parquet] def initializeLocalJobFunc( requiredColumns: Array[String], filters: Array[Filter], dataSchema: StructType, + parquetBlockSize: Long, useMetadataCache: Boolean, parquetFilterPushDown: Boolean, assumeBinaryIsString: Boolean, @@ -522,16 +554,21 @@ private[sql] object ParquetRelation extends Logging { conf.setBoolean(SQLConf.PARQUET_BINARY_AS_STRING.key, assumeBinaryIsString) conf.setBoolean(SQLConf.PARQUET_INT96_AS_TIMESTAMP.key, assumeInt96IsTimestamp) conf.setBoolean(SQLConf.PARQUET_FOLLOW_PARQUET_FORMAT_SPEC.key, followParquetFormatSpec) + + overrideMinSplitSize(parquetBlockSize, conf) } /** This closure sets input paths at the driver side. */ private[parquet] def initializeDriverSideJobFunc( - inputFiles: Array[FileStatus])(job: Job): Unit = { + inputFiles: Array[FileStatus], + parquetBlockSize: Long)(job: Job): Unit = { // We side the input paths at the driver side. logInfo(s"Reading Parquet file(s) from ${inputFiles.map(_.getPath).mkString(", ")}") if (inputFiles.nonEmpty) { FileInputFormat.setInputPaths(job, inputFiles.map(_.getPath): _*) } + + overrideMinSplitSize(parquetBlockSize, job.getConfiguration) } private[parquet] def readSchema( From f01c4220d2b791f470fa6596ffe11baa51517fbe Mon Sep 17 00:00:00 2001 From: "Joseph K. Bradley" Date: Fri, 21 Aug 2015 16:28:00 -0700 Subject: [PATCH 037/802] [SPARK-10163] [ML] Allow single-category features for GBT models Removed categorical feature info validation since no longer needed This is needed to make the ML user guide examples work (in another current PR). CC: mengxr Author: Joseph K. Bradley Closes #8367 from jkbradley/gbt-single-cat. --- .../org/apache/spark/mllib/tree/configuration/Strategy.scala | 5 ----- 1 file changed, 5 deletions(-) diff --git a/mllib/src/main/scala/org/apache/spark/mllib/tree/configuration/Strategy.scala b/mllib/src/main/scala/org/apache/spark/mllib/tree/configuration/Strategy.scala index a58f01ba8544e..b74e3f1f46523 100644 --- a/mllib/src/main/scala/org/apache/spark/mllib/tree/configuration/Strategy.scala +++ b/mllib/src/main/scala/org/apache/spark/mllib/tree/configuration/Strategy.scala @@ -158,11 +158,6 @@ class Strategy ( s" Valid values are integers >= 0.") require(maxBins >= 2, s"DecisionTree Strategy given invalid maxBins parameter: $maxBins." + s" Valid values are integers >= 2.") - categoricalFeaturesInfo.foreach { case (feature, arity) => - require(arity >= 2, - s"DecisionTree Strategy given invalid categoricalFeaturesInfo setting:" + - s" feature $feature has $arity categories. The number of categories should be >= 2.") - } require(minInstancesPerNode >= 1, s"DecisionTree Strategy requires minInstancesPerNode >= 1 but was given $minInstancesPerNode") require(maxMemoryInMB <= 10240, From 630a994e6a9785d1704f8e7fb604f32f5dea24f8 Mon Sep 17 00:00:00 2001 From: Xusen Yin Date: Fri, 21 Aug 2015 16:30:12 -0700 Subject: [PATCH 038/802] [SPARK-9893] User guide with Java test suite for VectorSlicer Add user guide for `VectorSlicer`, with Java test suite and Python version VectorSlicer. Note that Python version does not support selecting by names now. Author: Xusen Yin Closes #8267 from yinxusen/SPARK-9893. 
--- docs/ml-features.md | 133 ++++++++++++++++++ .../ml/feature/JavaVectorSlicerSuite.java | 85 +++++++++++ 2 files changed, 218 insertions(+) create mode 100644 mllib/src/test/java/org/apache/spark/ml/feature/JavaVectorSlicerSuite.java diff --git a/docs/ml-features.md b/docs/ml-features.md index 6309db97be4d0..642a4b4c53183 100644 --- a/docs/ml-features.md +++ b/docs/ml-features.md @@ -1477,6 +1477,139 @@ print(output.select("features", "clicked").first()) +# Feature Selectors + +## VectorSlicer + +`VectorSlicer` is a transformer that takes a feature vector and outputs a new feature vector with a +sub-array of the original features. It is useful for extracting features from a vector column. + +`VectorSlicer` accepts a vector column with a specified indices, then outputs a new vector column +whose values are selected via those indices. There are two types of indices, + + 1. Integer indices that represents the indices into the vector, `setIndices()`; + + 2. String indices that represents the names of features into the vector, `setNames()`. + *This requires the vector column to have an `AttributeGroup` since the implementation matches on + the name field of an `Attribute`.* + +Specification by integer and string are both acceptable. Moreover, you can use integer index and +string name simultaneously. At least one feature must be selected. Duplicate features are not +allowed, so there can be no overlap between selected indices and names. Note that if names of +features are selected, an exception will be threw out when encountering with empty input attributes. + +The output vector will order features with the selected indices first (in the order given), +followed by the selected names (in the order given). + +**Examples** + +Suppose that we have a DataFrame with the column `userFeatures`: + +~~~ + userFeatures +------------------ + [0.0, 10.0, 0.5] +~~~ + +`userFeatures` is a vector column that contains three user features. Assuming that the first column +of `userFeatures` are all zeros, so we want to remove it and only the last two columns are selected. +The `VectorSlicer` selects the last two elements with `setIndices(1, 2)` then produces a new vector +column named `features`: + +~~~ + userFeatures | features +------------------|----------------------------- + [0.0, 10.0, 0.5] | [10.0, 0.5] +~~~ + +Suppose also that we have a potential input attributes for the `userFeatures`, i.e. +`["f1", "f2", "f3"]`, then we can use `setNames("f2", "f3")` to select them. + +~~~ + userFeatures | features +------------------|----------------------------- + [0.0, 10.0, 0.5] | [10.0, 0.5] + ["f1", "f2", "f3"] | ["f2", "f3"] +~~~ + +
+<div data-lang="scala" markdown="1">
+ +[`VectorSlicer`](api/scala/index.html#org.apache.spark.ml.feature.VectorSlicer) takes an input +column name with specified indices or names and an output column name. + +{% highlight scala %} +import org.apache.spark.mllib.linalg.Vectors +import org.apache.spark.ml.attribute.{Attribute, AttributeGroup, NumericAttribute} +import org.apache.spark.ml.feature.VectorSlicer +import org.apache.spark.sql.types.StructType +import org.apache.spark.sql.{DataFrame, Row, SQLContext} + +val data = Array( + Vectors.sparse(3, Seq((0, -2.0), (1, 2.3))), + Vectors.dense(-2.0, 2.3, 0.0) +) + +val defaultAttr = NumericAttribute.defaultAttr +val attrs = Array("f1", "f2", "f3").map(defaultAttr.withName) +val attrGroup = new AttributeGroup("userFeatures", attrs.asInstanceOf[Array[Attribute]]) + +val dataRDD = sc.parallelize(data).map(Row.apply) +val dataset = sqlContext.createDataFrame(dataRDD, StructType(attrGroup.toStructField())) + +val slicer = new VectorSlicer().setInputCol("userFeatures").setOutputCol("features") + +slicer.setIndices(1).setNames("f3") +// or slicer.setIndices(Array(1, 2)), or slicer.setNames(Array("f2", "f3")) + +val output = slicer.transform(dataset) +println(output.select("userFeatures", "features").first()) +{% endhighlight %} +
+
+<div data-lang="java" markdown="1">
+ +[`VectorSlicer`](api/java/org/apache/spark/ml/feature/VectorSlicer.html) takes an input column name +with specified indices or names and an output column name. + +{% highlight java %} +import java.util.Arrays; + +import org.apache.spark.api.java.JavaRDD; +import org.apache.spark.mllib.linalg.Vectors; +import org.apache.spark.sql.DataFrame; +import org.apache.spark.sql.Row; +import org.apache.spark.sql.RowFactory; +import org.apache.spark.sql.types.*; +import static org.apache.spark.sql.types.DataTypes.*; + +Attribute[] attrs = new Attribute[]{ + NumericAttribute.defaultAttr().withName("f1"), + NumericAttribute.defaultAttr().withName("f2"), + NumericAttribute.defaultAttr().withName("f3") +}; +AttributeGroup group = new AttributeGroup("userFeatures", attrs); + +JavaRDD jrdd = jsc.parallelize(Lists.newArrayList( + RowFactory.create(Vectors.sparse(3, new int[]{0, 1}, new double[]{-2.0, 2.3})), + RowFactory.create(Vectors.dense(-2.0, 2.3, 0.0)) +)); + +DataFrame dataset = jsql.createDataFrame(jrdd, (new StructType()).add(group.toStructField())); + +VectorSlicer vectorSlicer = new VectorSlicer() + .setInputCol("userFeatures").setOutputCol("features"); + +vectorSlicer.setIndices(new int[]{1}).setNames(new String[]{"f3"}); +// or slicer.setIndices(new int[]{1, 2}), or slicer.setNames(new String[]{"f2", "f3"}) + +DataFrame output = vectorSlicer.transform(dataset); + +System.out.println(output.select("userFeatures", "features").first()); +{% endhighlight %} +
+</div>
+ ## RFormula `RFormula` selects columns specified by an [R model formula](https://stat.ethz.ch/R-manual/R-devel/library/stats/html/formula.html). It produces a vector column of features and a double column of labels. Like when formulas are used in R for linear regression, string input columns will be one-hot encoded, and numeric columns will be cast to doubles. If not already present in the DataFrame, the output label column will be created from the specified response variable in the formula. diff --git a/mllib/src/test/java/org/apache/spark/ml/feature/JavaVectorSlicerSuite.java b/mllib/src/test/java/org/apache/spark/ml/feature/JavaVectorSlicerSuite.java new file mode 100644 index 0000000000000..56988b9fb29cb --- /dev/null +++ b/mllib/src/test/java/org/apache/spark/ml/feature/JavaVectorSlicerSuite.java @@ -0,0 +1,85 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.spark.ml.feature; + +import com.google.common.collect.Lists; + +import org.junit.After; +import org.junit.Assert; +import org.junit.Before; +import org.junit.Test; + +import org.apache.spark.api.java.JavaRDD; +import org.apache.spark.api.java.JavaSparkContext; +import org.apache.spark.ml.attribute.Attribute; +import org.apache.spark.ml.attribute.AttributeGroup; +import org.apache.spark.ml.attribute.NumericAttribute; +import org.apache.spark.mllib.linalg.Vector; +import org.apache.spark.mllib.linalg.Vectors; +import org.apache.spark.sql.DataFrame; +import org.apache.spark.sql.Row; +import org.apache.spark.sql.RowFactory; +import org.apache.spark.sql.SQLContext; +import org.apache.spark.sql.types.StructType; + + +public class JavaVectorSlicerSuite { + private transient JavaSparkContext jsc; + private transient SQLContext jsql; + + @Before + public void setUp() { + jsc = new JavaSparkContext("local", "JavaVectorSlicerSuite"); + jsql = new SQLContext(jsc); + } + + @After + public void tearDown() { + jsc.stop(); + jsc = null; + } + + @Test + public void vectorSlice() { + Attribute[] attrs = new Attribute[]{ + NumericAttribute.defaultAttr().withName("f1"), + NumericAttribute.defaultAttr().withName("f2"), + NumericAttribute.defaultAttr().withName("f3") + }; + AttributeGroup group = new AttributeGroup("userFeatures", attrs); + + JavaRDD jrdd = jsc.parallelize(Lists.newArrayList( + RowFactory.create(Vectors.sparse(3, new int[]{0, 1}, new double[]{-2.0, 2.3})), + RowFactory.create(Vectors.dense(-2.0, 2.3, 0.0)) + )); + + DataFrame dataset = jsql.createDataFrame(jrdd, (new StructType()).add(group.toStructField())); + + VectorSlicer vectorSlicer = new VectorSlicer() + .setInputCol("userFeatures").setOutputCol("features"); + + vectorSlicer.setIndices(new int[]{1}).setNames(new String[]{"f3"}); + + DataFrame output = vectorSlicer.transform(dataset); + + for (Row r : 
output.select("userFeatures", "features").take(2)) { + Vector features = r.getAs(1); + Assert.assertEquals(features.size(), 2); + } + } +} From 46fcb9e0dbb2b28110f68a3d9f6c0c47bfd197b1 Mon Sep 17 00:00:00 2001 From: Keiji Yoshida Date: Sat, 22 Aug 2015 02:38:10 -0700 Subject: [PATCH 039/802] Update programming-guide.md Update `lineLengths.persist();` to `lineLengths.persist(StorageLevel.MEMORY_ONLY());` because `JavaRDD#persist` needs a parameter of `StorageLevel`. Author: Keiji Yoshida Closes #8372 from yosssi/patch-1. --- docs/programming-guide.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/programming-guide.md b/docs/programming-guide.md index 982c5eabe652b..4cf83bb392636 100644 --- a/docs/programming-guide.md +++ b/docs/programming-guide.md @@ -549,7 +549,7 @@ returning only its answer to the driver program. If we also wanted to use `lineLengths` again later, we could add: {% highlight java %} -lineLengths.persist(); +lineLengths.persist(StorageLevel.MEMORY_ONLY()); {% endhighlight %} before the `reduce`, which would cause `lineLengths` to be saved in memory after the first time it is computed. From 90cb9f05655a25b95b8f9fe81da14e5b9c8bcf44 Mon Sep 17 00:00:00 2001 From: Yijie Shen Date: Sat, 22 Aug 2015 10:16:35 -0700 Subject: [PATCH 040/802] [SPARK-9401] [SQL] Fully implement code generation for ConcatWs This PR adds full codegen support for ConcatWs, is a substitute of #7782 JIRA: https://issues.apache.org/jira/browse/SPARK-9401 cc davies Author: Yijie Shen Closes #8353 from yjshen/concatws. --- .../expressions/stringExpressions.scala | 42 +++++++++++++++++-- 1 file changed, 39 insertions(+), 3 deletions(-) diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/stringExpressions.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/stringExpressions.scala index b60d318534a41..48d02bb534501 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/stringExpressions.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/stringExpressions.scala @@ -72,7 +72,7 @@ case class Concat(children: Seq[Expression]) extends Expression with ImplicitCas * Returns null if the separator is null. Otherwise, concat_ws skips all null values. */ case class ConcatWs(children: Seq[Expression]) - extends Expression with ImplicitCastInputTypes with CodegenFallback { + extends Expression with ImplicitCastInputTypes { require(children.nonEmpty, s"$prettyName requires at least one argument.") @@ -114,8 +114,44 @@ case class ConcatWs(children: Seq[Expression]) boolean ${ev.isNull} = ${ev.primitive} == null; """ } else { - // Contains a mix of strings and arrays. Fall back to interpreted mode for now. - super.genCode(ctx, ev) + val array = ctx.freshName("array") + val varargNum = ctx.freshName("varargNum") + val idxInVararg = ctx.freshName("idxInVararg") + + val evals = children.map(_.gen(ctx)) + val (varargCount, varargBuild) = children.tail.zip(evals.tail).map { case (child, eval) => + child.dataType match { + case StringType => + ("", // we count all the StringType arguments num at once below. + s"$array[$idxInVararg ++] = ${eval.isNull} ? 
(UTF8String) null : ${eval.primitive};") + case _: ArrayType => + val size = ctx.freshName("n") + (s""" + if (!${eval.isNull}) { + $varargNum += ${eval.primitive}.numElements(); + } + """, + s""" + if (!${eval.isNull}) { + final int $size = ${eval.primitive}.numElements(); + for (int j = 0; j < $size; j ++) { + $array[$idxInVararg ++] = ${ctx.getValue(eval.primitive, StringType, "j")}; + } + } + """) + } + }.unzip + + evals.map(_.code).mkString("\n") + + s""" + int $varargNum = ${children.count(_.dataType == StringType) - 1}; + int $idxInVararg = 0; + ${varargCount.mkString("\n")} + UTF8String[] $array = new UTF8String[$varargNum]; + ${varargBuild.mkString("\n")} + UTF8String ${ev.primitive} = UTF8String.concatWs(${evals.head.primitive}, $array); + boolean ${ev.isNull} = ${ev.primitive} == null; + """ } } } From 623c675fde7a3a39957a62c7af26a54f4b01f8ce Mon Sep 17 00:00:00 2001 From: Keiji Yoshida Date: Sun, 23 Aug 2015 11:04:29 +0100 Subject: [PATCH 041/802] Update streaming-programming-guide.md Update `See the Scala example` to `See the Java example`. Author: Keiji Yoshida Closes #8376 from yosssi/patch-1. --- docs/streaming-programming-guide.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/streaming-programming-guide.md b/docs/streaming-programming-guide.md index c59d936b43c88..118ced298f4b0 100644 --- a/docs/streaming-programming-guide.md +++ b/docs/streaming-programming-guide.md @@ -1702,7 +1702,7 @@ context.awaitTermination(); If the `checkpointDirectory` exists, then the context will be recreated from the checkpoint data. If the directory does not exist (i.e., running for the first time), then the function `contextFactory` will be called to create a new -context and set up the DStreams. See the Scala example +context and set up the DStreams. See the Java example [JavaRecoverableNetworkWordCount]({{site.SPARK_GITHUB_URL}}/tree/master/examples/src/main/java/org/apache/spark/examples/streaming/JavaRecoverableNetworkWordCount.java). This example appends the word counts of network data into a file. From c6df5f66d9a8b9760f2cd46fcd930f977650c9c5 Mon Sep 17 00:00:00 2001 From: zsxwing Date: Sun, 23 Aug 2015 17:41:49 -0700 Subject: [PATCH 042/802] [SPARK-10148] [STREAMING] Display active and inactive receiver numbers in Streaming page Added the active and inactive receiver numbers in the summary section of Streaming page. screen shot 2015-08-21 at 2 08 54 pm Author: zsxwing Closes #8351 from zsxwing/receiver-number. 
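Returning briefly to the ConcatWs patch above, a small usage sketch of the mixed string/array case that previously fell back to interpreted evaluation. This is illustrative only; it assumes a `SQLContext` named `sqlContext`, and the column names are made up.

```scala
import org.apache.spark.sql.functions.{col, concat_ws}

// One array<string> column mixed with plain string columns: the case the new
// codegen path handles instead of falling back to CodegenFallback.
val df = sqlContext.createDataFrame(Seq(
  ("a", "b", Seq("c", "d")),
  ("x", "y", Seq.empty[String])
)).toDF("s1", "s2", "arr")

// concat_ws skips nulls and flattens the array elements into the joined result.
df.select(concat_ws("-", col("s1"), col("s2"), col("arr")).as("joined")).show()
// expected rows: "a-b-c-d" and "x-y"
```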
--- .../spark/streaming/ui/StreamingJobProgressListener.scala | 8 ++++++++ .../org/apache/spark/streaming/ui/StreamingPage.scala | 6 ++++++ 2 files changed, 14 insertions(+) diff --git a/streaming/src/main/scala/org/apache/spark/streaming/ui/StreamingJobProgressListener.scala b/streaming/src/main/scala/org/apache/spark/streaming/ui/StreamingJobProgressListener.scala index b77c555c68b8b..78aeb004e18b1 100644 --- a/streaming/src/main/scala/org/apache/spark/streaming/ui/StreamingJobProgressListener.scala +++ b/streaming/src/main/scala/org/apache/spark/streaming/ui/StreamingJobProgressListener.scala @@ -148,6 +148,14 @@ private[streaming] class StreamingJobProgressListener(ssc: StreamingContext) receiverInfos.size } + def numActiveReceivers: Int = synchronized { + receiverInfos.count(_._2.active) + } + + def numInactiveReceivers: Int = { + ssc.graph.getReceiverInputStreams().size - numActiveReceivers + } + def numTotalCompletedBatches: Long = synchronized { totalCompletedBatches } diff --git a/streaming/src/main/scala/org/apache/spark/streaming/ui/StreamingPage.scala b/streaming/src/main/scala/org/apache/spark/streaming/ui/StreamingPage.scala index 87af902428ec8..96d943e75d272 100644 --- a/streaming/src/main/scala/org/apache/spark/streaming/ui/StreamingPage.scala +++ b/streaming/src/main/scala/org/apache/spark/streaming/ui/StreamingPage.scala @@ -303,6 +303,7 @@ private[ui] class StreamingPage(parent: StreamingTab) val numCompletedBatches = listener.retainedCompletedBatches.size val numActiveBatches = batchTimes.length - numCompletedBatches + val numReceivers = listener.numInactiveReceivers + listener.numActiveReceivers val table = // scalastyle:off @@ -330,6 +331,11 @@ private[ui] class StreamingPage(parent: StreamingTab) } } + { + if (numReceivers > 0) { +
<div>Receivers: {listener.numActiveReceivers} / {numReceivers} active</div>
+                }
+              }
<div>Avg: {eventRateForAllStreams.formattedAvg} events/sec</div>
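A self-contained sketch of the counting rule behind the new summary line; `ReceiverRecord` here is a simplified stand-in for the listener's internal receiver bookkeeping, not a Spark class.

```scala
// Simplified model: each receiver input stream is expected to contribute one receiver,
// and a receiver counts as active only while it reports active = true.
final case class ReceiverRecord(streamId: Int, active: Boolean)

def receiverSummary(records: Seq[ReceiverRecord], expectedReceivers: Int): String = {
  val numActive = records.count(_.active)
  s"Receivers: $numActive / $expectedReceivers active"
}

// Example: three receiver streams, one receiver currently down.
println(receiverSummary(
  Seq(ReceiverRecord(0, active = true), ReceiverRecord(1, active = false), ReceiverRecord(2, active = true)),
  expectedReceivers = 3))
// Receivers: 2 / 3 active
```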
From b963c19a803c5a26c9b65655d40ca6621acf8bd4 Mon Sep 17 00:00:00 2001 From: "Joseph K. Bradley" Date: Sun, 23 Aug 2015 18:34:07 -0700 Subject: [PATCH 043/802] [SPARK-10164] [MLLIB] Fixed GMM distributed decomposition bug GaussianMixture now distributes matrix decompositions for certain problem sizes. Distributed computation actually fails, but this was not tested in unit tests. This PR adds a unit test which checks this. It failed previously but works with this fix. CC: mengxr Author: Joseph K. Bradley Closes #8370 from jkbradley/gmm-fix. --- .../mllib/clustering/GaussianMixture.scala | 22 +++++++++++++------ .../clustering/GaussianMixtureSuite.scala | 22 +++++++++++++++++-- 2 files changed, 35 insertions(+), 9 deletions(-) diff --git a/mllib/src/main/scala/org/apache/spark/mllib/clustering/GaussianMixture.scala b/mllib/src/main/scala/org/apache/spark/mllib/clustering/GaussianMixture.scala index fcc9dfecac54f..daa947e81d44d 100644 --- a/mllib/src/main/scala/org/apache/spark/mllib/clustering/GaussianMixture.scala +++ b/mllib/src/main/scala/org/apache/spark/mllib/clustering/GaussianMixture.scala @@ -169,9 +169,7 @@ class GaussianMixture private ( // Get length of the input vectors val d = breezeData.first().length - // Heuristic to distribute the computation of the [[MultivariateGaussian]]s, approximately when - // d > 25 except for when k is very small - val distributeGaussians = ((k - 1.0) / k) * d > 25 + val shouldDistributeGaussians = GaussianMixture.shouldDistributeGaussians(k, d) // Determine initial weights and corresponding Gaussians. // If the user supplied an initial GMM, we use those values, otherwise @@ -205,15 +203,15 @@ class GaussianMixture private ( // (often referred to as the "M" step in literature) val sumWeights = sums.weights.sum - if (distributeGaussians) { + if (shouldDistributeGaussians) { val numPartitions = math.min(k, 1024) val tuples = Seq.tabulate(k)(i => (sums.means(i), sums.sigmas(i), sums.weights(i))) val (ws, gs) = sc.parallelize(tuples, numPartitions).map { case (mean, sigma, weight) => updateWeightsAndGaussians(mean, sigma, weight, sumWeights) - }.collect.unzip - Array.copy(ws, 0, weights, 0, ws.length) - Array.copy(gs, 0, gaussians, 0, gs.length) + }.collect().unzip + Array.copy(ws.toArray, 0, weights, 0, ws.length) + Array.copy(gs.toArray, 0, gaussians, 0, gs.length) } else { var i = 0 while (i < k) { @@ -271,6 +269,16 @@ class GaussianMixture private ( } } +private[clustering] object GaussianMixture { + /** + * Heuristic to distribute the computation of the [[MultivariateGaussian]]s, approximately when + * d > 25 except for when k is very small. 
+ * @param k Number of topics + * @param d Number of features + */ + def shouldDistributeGaussians(k: Int, d: Int): Boolean = ((k - 1.0) / k) * d > 25 +} + // companion class to provide zero constructor for ExpectationSum private object ExpectationSum { def zero(k: Int, d: Int): ExpectationSum = { diff --git a/mllib/src/test/scala/org/apache/spark/mllib/clustering/GaussianMixtureSuite.scala b/mllib/src/test/scala/org/apache/spark/mllib/clustering/GaussianMixtureSuite.scala index b636d02f786e6..a72723eb00daf 100644 --- a/mllib/src/test/scala/org/apache/spark/mllib/clustering/GaussianMixtureSuite.scala +++ b/mllib/src/test/scala/org/apache/spark/mllib/clustering/GaussianMixtureSuite.scala @@ -18,7 +18,7 @@ package org.apache.spark.mllib.clustering import org.apache.spark.SparkFunSuite -import org.apache.spark.mllib.linalg.{Vectors, Matrices} +import org.apache.spark.mllib.linalg.{Vector, Vectors, Matrices} import org.apache.spark.mllib.stat.distribution.MultivariateGaussian import org.apache.spark.mllib.util.MLlibTestSparkContext import org.apache.spark.mllib.util.TestingUtils._ @@ -76,6 +76,20 @@ class GaussianMixtureSuite extends SparkFunSuite with MLlibTestSparkContext { assert(gmm.gaussians(1).sigma ~== Esigma(1) absTol 1E-3) } + test("two clusters with distributed decompositions") { + val data = sc.parallelize(GaussianTestData.data2, 2) + + val k = 5 + val d = data.first().size + assert(GaussianMixture.shouldDistributeGaussians(k, d)) + + val gmm = new GaussianMixture() + .setK(k) + .run(data) + + assert(gmm.k === k) + } + test("single cluster with sparse data") { val data = sc.parallelize(Array( Vectors.sparse(3, Array(0, 2), Array(4.0, 2.0)), @@ -116,7 +130,7 @@ class GaussianMixtureSuite extends SparkFunSuite with MLlibTestSparkContext { val sparseGMM = new GaussianMixture() .setK(2) .setInitialModel(initialGmm) - .run(data) + .run(sparseData) assert(sparseGMM.weights(0) ~== Ew(0) absTol 1E-3) assert(sparseGMM.weights(1) ~== Ew(1) absTol 1E-3) @@ -168,5 +182,9 @@ class GaussianMixtureSuite extends SparkFunSuite with MLlibTestSparkContext { Vectors.dense( 4.5605), Vectors.dense( 5.2043), Vectors.dense( 6.2734) ) + val data2: Array[Vector] = Array.tabulate(25){ i: Int => + Vectors.dense(Array.tabulate(50)(i + _.toDouble)) + } + } } From 053d94fcf32268369b5a40837271f15d6af41aa4 Mon Sep 17 00:00:00 2001 From: Tathagata Das Date: Sun, 23 Aug 2015 19:24:32 -0700 Subject: [PATCH 044/802] [SPARK-10142] [STREAMING] Made python checkpoint recovery handle non-local checkpoint paths and existing SparkContexts The current code only checks checkpoint files in local filesystem, and always tries to create a new Python SparkContext (even if one already exists). The solution is to do the following: 1. Use the same code path as Java to check whether a valid checkpoint exists 2. Create a new Python SparkContext only if there no active one. There is not test for the path as its hard to test with distributed filesystem paths in a local unit test. I am going to test it with a distributed file system manually to verify that this patch works. 
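For readers following along, the Scala-side recovery pattern that this change mirrors in Python looks roughly like the sketch below; the checkpoint directory, app name, and batch interval are placeholders, and the DStream setup is elided.

```scala
import org.apache.spark.SparkConf
import org.apache.spark.streaming.{Seconds, StreamingContext}

val checkpointDir = "hdfs:///tmp/recoverable-app"   // placeholder path

def createContext(): StreamingContext = {
  val conf = new SparkConf().setAppName("RecoverableApp")
  val ssc = new StreamingContext(conf, Seconds(1))
  // define the DStream graph here before enabling checkpointing
  ssc.checkpoint(checkpointDir)
  ssc
}

// Recreates the context from checkpoint data if it exists; otherwise calls createContext().
val ssc = StreamingContext.getOrCreate(checkpointDir, createContext _)
ssc.start()
ssc.awaitTermination()
```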
Author: Tathagata Das Closes #8366 from tdas/SPARK-10142 and squashes the following commits: 3afa666 [Tathagata Das] Added tests 2dd4ae5 [Tathagata Das] Added the check to not create a context if one already exists 9bf151b [Tathagata Das] Made python checkpoint recovery use java to find the checkpoint files --- python/pyspark/streaming/context.py | 22 ++++++---- python/pyspark/streaming/tests.py | 43 ++++++++++++++++--- .../apache/spark/streaming/Checkpoint.scala | 9 ++++ 3 files changed, 58 insertions(+), 16 deletions(-) diff --git a/python/pyspark/streaming/context.py b/python/pyspark/streaming/context.py index e3ba70e4e5e88..4069d7a149986 100644 --- a/python/pyspark/streaming/context.py +++ b/python/pyspark/streaming/context.py @@ -150,26 +150,30 @@ def getOrCreate(cls, checkpointPath, setupFunc): @param checkpointPath: Checkpoint directory used in an earlier streaming program @param setupFunc: Function to create a new context and setup DStreams """ - # TODO: support checkpoint in HDFS - if not os.path.exists(checkpointPath) or not os.listdir(checkpointPath): + cls._ensure_initialized() + gw = SparkContext._gateway + + # Check whether valid checkpoint information exists in the given path + if gw.jvm.CheckpointReader.read(checkpointPath).isEmpty(): ssc = setupFunc() ssc.checkpoint(checkpointPath) return ssc - cls._ensure_initialized() - gw = SparkContext._gateway - try: jssc = gw.jvm.JavaStreamingContext(checkpointPath) except Exception: print("failed to load StreamingContext from checkpoint", file=sys.stderr) raise - jsc = jssc.sparkContext() - conf = SparkConf(_jconf=jsc.getConf()) - sc = SparkContext(conf=conf, gateway=gw, jsc=jsc) + # If there is already an active instance of Python SparkContext use it, or create a new one + if not SparkContext._active_spark_context: + jsc = jssc.sparkContext() + conf = SparkConf(_jconf=jsc.getConf()) + SparkContext(conf=conf, gateway=gw, jsc=jsc) + + sc = SparkContext._active_spark_context + # update ctx in serializer - SparkContext._active_spark_context = sc cls._transformerSerializer.ctx = sc return StreamingContext(sc, None, jssc) diff --git a/python/pyspark/streaming/tests.py b/python/pyspark/streaming/tests.py index 214d5be439003..510a4f2b3e472 100644 --- a/python/pyspark/streaming/tests.py +++ b/python/pyspark/streaming/tests.py @@ -603,6 +603,10 @@ def tearDownClass(): def tearDown(self): if self.ssc is not None: self.ssc.stop(True) + if self.sc is not None: + self.sc.stop() + if self.cpd is not None: + shutil.rmtree(self.cpd) def test_get_or_create_and_get_active_or_create(self): inputd = tempfile.mkdtemp() @@ -622,8 +626,12 @@ def setup(): self.setupCalled = True return ssc - cpd = tempfile.mkdtemp("test_streaming_cps") - self.ssc = StreamingContext.getOrCreate(cpd, setup) + # Verify that getOrCreate() calls setup() in absence of checkpoint files + self.cpd = tempfile.mkdtemp("test_streaming_cps") + self.setupCalled = False + self.ssc = StreamingContext.getOrCreate(self.cpd, setup) + self.assertFalse(self.setupCalled) + self.ssc.start() def check_output(n): @@ -660,31 +668,52 @@ def check_output(n): self.ssc.stop(True, True) time.sleep(1) self.setupCalled = False - self.ssc = StreamingContext.getOrCreate(cpd, setup) + self.ssc = StreamingContext.getOrCreate(self.cpd, setup) self.assertFalse(self.setupCalled) self.ssc.start() check_output(3) + # Verify that getOrCreate() uses existing SparkContext + self.ssc.stop(True, True) + time.sleep(1) + sc = SparkContext(SparkConf()) + self.setupCalled = False + self.ssc = 
StreamingContext.getOrCreate(self.cpd, setup) + self.assertFalse(self.setupCalled) + self.assertTrue(self.ssc.sparkContext == sc) + # Verify the getActiveOrCreate() recovers from checkpoint files self.ssc.stop(True, True) time.sleep(1) self.setupCalled = False - self.ssc = StreamingContext.getActiveOrCreate(cpd, setup) + self.ssc = StreamingContext.getActiveOrCreate(self.cpd, setup) self.assertFalse(self.setupCalled) self.ssc.start() check_output(4) # Verify that getActiveOrCreate() returns active context self.setupCalled = False - self.assertEquals(StreamingContext.getActiveOrCreate(cpd, setup), self.ssc) + self.assertEquals(StreamingContext.getActiveOrCreate(self.cpd, setup), self.ssc) self.assertFalse(self.setupCalled) + # Verify that getActiveOrCreate() uses existing SparkContext + self.ssc.stop(True, True) + time.sleep(1) + self.sc = SparkContext(SparkConf()) + self.setupCalled = False + self.ssc = StreamingContext.getActiveOrCreate(self.cpd, setup) + self.assertFalse(self.setupCalled) + self.assertTrue(self.ssc.sparkContext == sc) + # Verify that getActiveOrCreate() calls setup() in absence of checkpoint files self.ssc.stop(True, True) - shutil.rmtree(cpd) # delete checkpoint directory + shutil.rmtree(self.cpd) # delete checkpoint directory + time.sleep(1) self.setupCalled = False - self.ssc = StreamingContext.getActiveOrCreate(cpd, setup) + self.ssc = StreamingContext.getActiveOrCreate(self.cpd, setup) self.assertTrue(self.setupCalled) + + # Stop everything self.ssc.stop(True, True) diff --git a/streaming/src/main/scala/org/apache/spark/streaming/Checkpoint.scala b/streaming/src/main/scala/org/apache/spark/streaming/Checkpoint.scala index 6f6b449accc3c..cd5d960369c05 100644 --- a/streaming/src/main/scala/org/apache/spark/streaming/Checkpoint.scala +++ b/streaming/src/main/scala/org/apache/spark/streaming/Checkpoint.scala @@ -286,6 +286,15 @@ class CheckpointWriter( private[streaming] object CheckpointReader extends Logging { + /** + * Read checkpoint files present in the given checkpoint directory. If there are no checkpoint + * files, then return None, else try to return the latest valid checkpoint object. If no + * checkpoint files could be read correctly, then return None. + */ + def read(checkpointDir: String): Option[Checkpoint] = { + read(checkpointDir, new SparkConf(), SparkHadoopUtil.get.conf, ignoreReadError = true) + } + /** * Read checkpoint files present in the given checkpoint directory. If there are no checkpoint * files, then return None, else try to return the latest valid checkpoint object. If no From 4e0395ddb764d092b5b38447af49e196e590e0f0 Mon Sep 17 00:00:00 2001 From: zsxwing Date: Mon, 24 Aug 2015 12:38:01 -0700 Subject: [PATCH 045/802] [SPARK-10168] [STREAMING] Fix the issue that maven publishes wrong artifact jars This PR removed the `outputFile` configuration from pom.xml and updated `tests.py` to search jars for both sbt build and maven build. I ran ` mvn -Pkinesis-asl -DskipTests clean install` locally, and verified the jars in my local repository were correct. I also checked Python tests for maven build, and it passed all tests. 
Author: zsxwing Closes #8373 from zsxwing/SPARK-10168 and squashes the following commits: e0b5818 [zsxwing] Fix the sbt build c697627 [zsxwing] Add the jar pathes to the exception message be1d8a5 [zsxwing] Fix the issue that maven publishes wrong artifact jars --- external/flume-assembly/pom.xml | 1 - external/kafka-assembly/pom.xml | 1 - external/mqtt-assembly/pom.xml | 1 - extras/kinesis-asl-assembly/pom.xml | 1 - python/pyspark/streaming/tests.py | 47 ++++++++++++++++------------- 5 files changed, 26 insertions(+), 25 deletions(-) diff --git a/external/flume-assembly/pom.xml b/external/flume-assembly/pom.xml index e05e4318969ce..561ed4babe5d0 100644 --- a/external/flume-assembly/pom.xml +++ b/external/flume-assembly/pom.xml @@ -115,7 +115,6 @@ maven-shade-plugin false - ${project.build.directory}/scala-${scala.binary.version}/spark-streaming-flume-assembly-${project.version}.jar *:* diff --git a/external/kafka-assembly/pom.xml b/external/kafka-assembly/pom.xml index 36342f37bb2ea..6f4e2a89e9af7 100644 --- a/external/kafka-assembly/pom.xml +++ b/external/kafka-assembly/pom.xml @@ -142,7 +142,6 @@ maven-shade-plugin false - ${project.build.directory}/scala-${scala.binary.version}/spark-streaming-kafka-assembly-${project.version}.jar *:* diff --git a/external/mqtt-assembly/pom.xml b/external/mqtt-assembly/pom.xml index f3e3f93e7ed50..8412600633734 100644 --- a/external/mqtt-assembly/pom.xml +++ b/external/mqtt-assembly/pom.xml @@ -132,7 +132,6 @@ maven-shade-plugin false - ${project.build.directory}/scala-${scala.binary.version}/spark-streaming-mqtt-assembly-${project.version}.jar *:* diff --git a/extras/kinesis-asl-assembly/pom.xml b/extras/kinesis-asl-assembly/pom.xml index 3ca538608f694..51af3e6f2225f 100644 --- a/extras/kinesis-asl-assembly/pom.xml +++ b/extras/kinesis-asl-assembly/pom.xml @@ -137,7 +137,6 @@ maven-shade-plugin false - ${project.build.directory}/scala-${scala.binary.version}/spark-streaming-kinesis-asl-assembly-${project.version}.jar *:* diff --git a/python/pyspark/streaming/tests.py b/python/pyspark/streaming/tests.py index 510a4f2b3e472..cfea95b0dec71 100644 --- a/python/pyspark/streaming/tests.py +++ b/python/pyspark/streaming/tests.py @@ -1162,11 +1162,20 @@ def get_output(_, rdd): kinesisTestUtils.deleteDynamoDBTable(kinesisAppName) +# Search jar in the project dir using the jar name_prefix for both sbt build and maven build because +# the artifact jars are in different directories. +def search_jar(dir, name_prefix): + # We should ignore the following jars + ignored_jar_suffixes = ("javadoc.jar", "sources.jar", "test-sources.jar", "tests.jar") + jars = (glob.glob(os.path.join(dir, "target/scala-*/" + name_prefix + "-*.jar")) + # sbt build + glob.glob(os.path.join(dir, "target/" + name_prefix + "_*.jar"))) # maven build + return [jar for jar in jars if not jar.endswith(ignored_jar_suffixes)] + + def search_kafka_assembly_jar(): SPARK_HOME = os.environ["SPARK_HOME"] kafka_assembly_dir = os.path.join(SPARK_HOME, "external/kafka-assembly") - jars = glob.glob( - os.path.join(kafka_assembly_dir, "target/scala-*/spark-streaming-kafka-assembly-*.jar")) + jars = search_jar(kafka_assembly_dir, "spark-streaming-kafka-assembly") if not jars: raise Exception( ("Failed to find Spark Streaming kafka assembly jar in %s. 
" % kafka_assembly_dir) + @@ -1174,8 +1183,8 @@ def search_kafka_assembly_jar(): "'build/sbt assembly/assembly streaming-kafka-assembly/assembly' or " "'build/mvn package' before running this test.") elif len(jars) > 1: - raise Exception(("Found multiple Spark Streaming Kafka assembly JARs in %s; please " - "remove all but one") % kafka_assembly_dir) + raise Exception(("Found multiple Spark Streaming Kafka assembly JARs: %s; please " + "remove all but one") % (", ".join(jars))) else: return jars[0] @@ -1183,8 +1192,7 @@ def search_kafka_assembly_jar(): def search_flume_assembly_jar(): SPARK_HOME = os.environ["SPARK_HOME"] flume_assembly_dir = os.path.join(SPARK_HOME, "external/flume-assembly") - jars = glob.glob( - os.path.join(flume_assembly_dir, "target/scala-*/spark-streaming-flume-assembly-*.jar")) + jars = search_jar(flume_assembly_dir, "spark-streaming-flume-assembly") if not jars: raise Exception( ("Failed to find Spark Streaming Flume assembly jar in %s. " % flume_assembly_dir) + @@ -1192,8 +1200,8 @@ def search_flume_assembly_jar(): "'build/sbt assembly/assembly streaming-flume-assembly/assembly' or " "'build/mvn package' before running this test.") elif len(jars) > 1: - raise Exception(("Found multiple Spark Streaming Flume assembly JARs in %s; please " - "remove all but one") % flume_assembly_dir) + raise Exception(("Found multiple Spark Streaming Flume assembly JARs: %s; please " + "remove all but one") % (", ".join(jars))) else: return jars[0] @@ -1201,8 +1209,7 @@ def search_flume_assembly_jar(): def search_mqtt_assembly_jar(): SPARK_HOME = os.environ["SPARK_HOME"] mqtt_assembly_dir = os.path.join(SPARK_HOME, "external/mqtt-assembly") - jars = glob.glob( - os.path.join(mqtt_assembly_dir, "target/scala-*/spark-streaming-mqtt-assembly-*.jar")) + jars = search_jar(mqtt_assembly_dir, "spark-streaming-mqtt-assembly") if not jars: raise Exception( ("Failed to find Spark Streaming MQTT assembly jar in %s. 
" % mqtt_assembly_dir) + @@ -1210,8 +1217,8 @@ def search_mqtt_assembly_jar(): "'build/sbt assembly/assembly streaming-mqtt-assembly/assembly' or " "'build/mvn package' before running this test") elif len(jars) > 1: - raise Exception(("Found multiple Spark Streaming MQTT assembly JARs in %s; please " - "remove all but one") % mqtt_assembly_dir) + raise Exception(("Found multiple Spark Streaming MQTT assembly JARs: %s; please " + "remove all but one") % (", ".join(jars))) else: return jars[0] @@ -1227,8 +1234,8 @@ def search_mqtt_test_jar(): "You need to build Spark with " "'build/sbt assembly/assembly streaming-mqtt/test:assembly'") elif len(jars) > 1: - raise Exception(("Found multiple Spark Streaming MQTT test JARs in %s; please " - "remove all but one") % mqtt_test_dir) + raise Exception(("Found multiple Spark Streaming MQTT test JARs: %s; please " + "remove all but one") % (", ".join(jars))) else: return jars[0] @@ -1236,14 +1243,12 @@ def search_mqtt_test_jar(): def search_kinesis_asl_assembly_jar(): SPARK_HOME = os.environ["SPARK_HOME"] kinesis_asl_assembly_dir = os.path.join(SPARK_HOME, "extras/kinesis-asl-assembly") - jars = glob.glob( - os.path.join(kinesis_asl_assembly_dir, - "target/scala-*/spark-streaming-kinesis-asl-assembly-*.jar")) + jars = search_jar(kinesis_asl_assembly_dir, "spark-streaming-kinesis-asl-assembly") if not jars: return None elif len(jars) > 1: - raise Exception(("Found multiple Spark Streaming Kinesis ASL assembly JARs in %s; please " - "remove all but one") % kinesis_asl_assembly_dir) + raise Exception(("Found multiple Spark Streaming Kinesis ASL assembly JARs: %s; please " + "remove all but one") % (", ".join(jars))) else: return jars[0] @@ -1269,8 +1274,8 @@ def search_kinesis_asl_assembly_jar(): mqtt_test_jar, kinesis_asl_assembly_jar) os.environ["PYSPARK_SUBMIT_ARGS"] = "--jars %s pyspark-shell" % jars - testcases = [BasicOperationTests, WindowFunctionTests, StreamingContextTests, - CheckpointTests, KafkaStreamTests, FlumeStreamTests, FlumePollingStreamTests] + testcases = [BasicOperationTests, WindowFunctionTests, StreamingContextTests, CheckpointTests, + KafkaStreamTests, FlumeStreamTests, FlumePollingStreamTests, MQTTStreamTests] if kinesis_jar_present is True: testcases.append(KinesisStreamTests) From 7478c8b66d6a2b1179f20c38b49e27e37b0caec3 Mon Sep 17 00:00:00 2001 From: Tathagata Das Date: Mon, 24 Aug 2015 12:40:09 -0700 Subject: [PATCH 046/802] [SPARK-9791] [PACKAGE] Change private class to private class to prevent unnecessary classes from showing up in the docs In addition, some random cleanup of import ordering Author: Tathagata Das Closes #8387 from tdas/SPARK-9791 and squashes the following commits: 67f3ee9 [Tathagata Das] Change private class to private[package] class to prevent them from showing up in the docs --- .../spark/streaming/flume/FlumeUtils.scala | 2 +- .../apache/spark/streaming/kafka/Broker.scala | 6 ++-- .../streaming/kafka/KafkaTestUtils.scala | 10 +++--- .../spark/streaming/kafka/KafkaUtils.scala | 36 +++++-------------- .../spark/streaming/kafka/OffsetRange.scala | 8 ----- .../spark/streaming/mqtt/MQTTUtils.scala | 6 ++-- .../spark/streaming/mqtt/MQTTTestUtils.scala | 2 +- .../streaming/kinesis/KinesisTestUtils.scala | 2 +- .../spark/streaming/util/WriteAheadLog.java | 2 ++ .../util/WriteAheadLogRecordHandle.java | 2 ++ .../receiver/ReceivedBlockHandler.scala | 2 +- .../streaming/scheduler/ReceiverTracker.scala | 2 +- .../apache/spark/streaming/ui/BatchPage.scala | 2 +- 13 files changed, 28 insertions(+), 54 deletions(-) 
diff --git a/external/flume/src/main/scala/org/apache/spark/streaming/flume/FlumeUtils.scala b/external/flume/src/main/scala/org/apache/spark/streaming/flume/FlumeUtils.scala index 095bfb0c73a9a..a65a9b921aafa 100644 --- a/external/flume/src/main/scala/org/apache/spark/streaming/flume/FlumeUtils.scala +++ b/external/flume/src/main/scala/org/apache/spark/streaming/flume/FlumeUtils.scala @@ -247,7 +247,7 @@ object FlumeUtils { * This is a helper class that wraps the methods in FlumeUtils into more Python-friendly class and * function so that it can be easily instantiated and called from Python's FlumeUtils. */ -private class FlumeUtilsPythonHelper { +private[flume] class FlumeUtilsPythonHelper { def createStream( jssc: JavaStreamingContext, diff --git a/external/kafka/src/main/scala/org/apache/spark/streaming/kafka/Broker.scala b/external/kafka/src/main/scala/org/apache/spark/streaming/kafka/Broker.scala index 5a74febb4bd46..9159051ba06e4 100644 --- a/external/kafka/src/main/scala/org/apache/spark/streaming/kafka/Broker.scala +++ b/external/kafka/src/main/scala/org/apache/spark/streaming/kafka/Broker.scala @@ -20,11 +20,9 @@ package org.apache.spark.streaming.kafka import org.apache.spark.annotation.Experimental /** - * :: Experimental :: - * Represent the host and port info for a Kafka broker. - * Differs from the Kafka project's internal kafka.cluster.Broker, which contains a server ID + * Represents the host and port info for a Kafka broker. + * Differs from the Kafka project's internal kafka.cluster.Broker, which contains a server ID. */ -@Experimental final class Broker private( /** Broker's hostname */ val host: String, diff --git a/external/kafka/src/main/scala/org/apache/spark/streaming/kafka/KafkaTestUtils.scala b/external/kafka/src/main/scala/org/apache/spark/streaming/kafka/KafkaTestUtils.scala index b608b75952721..79a9db4291bef 100644 --- a/external/kafka/src/main/scala/org/apache/spark/streaming/kafka/KafkaTestUtils.scala +++ b/external/kafka/src/main/scala/org/apache/spark/streaming/kafka/KafkaTestUtils.scala @@ -20,9 +20,8 @@ package org.apache.spark.streaming.kafka import java.io.File import java.lang.{Integer => JInt} import java.net.InetSocketAddress -import java.util.{Map => JMap} -import java.util.Properties import java.util.concurrent.TimeoutException +import java.util.{Map => JMap, Properties} import scala.annotation.tailrec import scala.language.postfixOps @@ -30,17 +29,16 @@ import scala.util.control.NonFatal import kafka.admin.AdminUtils import kafka.api.Request -import kafka.common.TopicAndPartition import kafka.producer.{KeyedMessage, Producer, ProducerConfig} import kafka.serializer.StringEncoder import kafka.server.{KafkaConfig, KafkaServer} import kafka.utils.{ZKStringSerializer, ZkUtils} -import org.apache.zookeeper.server.{NIOServerCnxnFactory, ZooKeeperServer} import org.I0Itec.zkclient.ZkClient +import org.apache.zookeeper.server.{NIOServerCnxnFactory, ZooKeeperServer} -import org.apache.spark.{Logging, SparkConf} import org.apache.spark.streaming.Time import org.apache.spark.util.Utils +import org.apache.spark.{Logging, SparkConf} /** * This is a helper class for Kafka test suites. This has the functionality to set up @@ -48,7 +46,7 @@ import org.apache.spark.util.Utils * * The reason to put Kafka test utility class in src is to test Python related Kafka APIs. 
*/ -private class KafkaTestUtils extends Logging { +private[kafka] class KafkaTestUtils extends Logging { // Zookeeper related configurations private val zkHost = "localhost" diff --git a/external/kafka/src/main/scala/org/apache/spark/streaming/kafka/KafkaUtils.scala b/external/kafka/src/main/scala/org/apache/spark/streaming/kafka/KafkaUtils.scala index f3b01bd60b178..388dbb8184106 100644 --- a/external/kafka/src/main/scala/org/apache/spark/streaming/kafka/KafkaUtils.scala +++ b/external/kafka/src/main/scala/org/apache/spark/streaming/kafka/KafkaUtils.scala @@ -17,29 +17,25 @@ package org.apache.spark.streaming.kafka -import java.lang.{Integer => JInt} -import java.lang.{Long => JLong} -import java.util.{Map => JMap} -import java.util.{Set => JSet} -import java.util.{List => JList} +import java.lang.{Integer => JInt, Long => JLong} +import java.util.{List => JList, Map => JMap, Set => JSet} -import scala.reflect.ClassTag import scala.collection.JavaConversions._ +import scala.reflect.ClassTag import kafka.common.TopicAndPartition import kafka.message.MessageAndMetadata -import kafka.serializer.{DefaultDecoder, Decoder, StringDecoder} +import kafka.serializer.{Decoder, DefaultDecoder, StringDecoder} import org.apache.spark.api.java.function.{Function => JFunction} -import org.apache.spark.streaming.util.WriteAheadLogUtils -import org.apache.spark.{SparkContext, SparkException} -import org.apache.spark.annotation.Experimental +import org.apache.spark.api.java.{JavaPairRDD, JavaRDD, JavaSparkContext} import org.apache.spark.rdd.RDD import org.apache.spark.storage.StorageLevel import org.apache.spark.streaming.StreamingContext -import org.apache.spark.streaming.api.java.{JavaPairInputDStream, JavaInputDStream, JavaPairReceiverInputDStream, JavaStreamingContext} +import org.apache.spark.streaming.api.java.{JavaInputDStream, JavaPairInputDStream, JavaPairReceiverInputDStream, JavaStreamingContext} import org.apache.spark.streaming.dstream.{InputDStream, ReceiverInputDStream} -import org.apache.spark.api.java.{JavaSparkContext, JavaPairRDD, JavaRDD} +import org.apache.spark.streaming.util.WriteAheadLogUtils +import org.apache.spark.{SparkContext, SparkException} object KafkaUtils { /** @@ -196,7 +192,6 @@ object KafkaUtils { * @param offsetRanges Each OffsetRange in the batch corresponds to a * range of offsets for a given Kafka topic/partition */ - @Experimental def createRDD[ K: ClassTag, V: ClassTag, @@ -214,7 +209,6 @@ object KafkaUtils { } /** - * :: Experimental :: * Create a RDD from Kafka using offset ranges for each topic and partition. This allows you * specify the Kafka leader to connect to (to optimize fetching) and access the message as well * as the metadata. @@ -230,7 +224,6 @@ object KafkaUtils { * in which case leaders will be looked up on the driver. * @param messageHandler Function for translating each message and metadata into the desired type */ - @Experimental def createRDD[ K: ClassTag, V: ClassTag, @@ -268,7 +261,6 @@ object KafkaUtils { * @param offsetRanges Each OffsetRange in the batch corresponds to a * range of offsets for a given Kafka topic/partition */ - @Experimental def createRDD[K, V, KD <: Decoder[K], VD <: Decoder[V]]( jsc: JavaSparkContext, keyClass: Class[K], @@ -287,7 +279,6 @@ object KafkaUtils { } /** - * :: Experimental :: * Create a RDD from Kafka using offset ranges for each topic and partition. This allows you * specify the Kafka leader to connect to (to optimize fetching) and access the message as well * as the metadata. 
@@ -303,7 +294,6 @@ object KafkaUtils { * in which case leaders will be looked up on the driver. * @param messageHandler Function for translating each message and metadata into the desired type */ - @Experimental def createRDD[K, V, KD <: Decoder[K], VD <: Decoder[V], R]( jsc: JavaSparkContext, keyClass: Class[K], @@ -327,7 +317,6 @@ object KafkaUtils { } /** - * :: Experimental :: * Create an input stream that directly pulls messages from Kafka Brokers * without using any receiver. This stream can guarantee that each message * from Kafka is included in transformations exactly once (see points below). @@ -357,7 +346,6 @@ object KafkaUtils { * starting point of the stream * @param messageHandler Function for translating each message and metadata into the desired type */ - @Experimental def createDirectStream[ K: ClassTag, V: ClassTag, @@ -375,7 +363,6 @@ object KafkaUtils { } /** - * :: Experimental :: * Create an input stream that directly pulls messages from Kafka Brokers * without using any receiver. This stream can guarantee that each message * from Kafka is included in transformations exactly once (see points below). @@ -405,7 +392,6 @@ object KafkaUtils { * to determine where the stream starts (defaults to "largest") * @param topics Names of the topics to consume */ - @Experimental def createDirectStream[ K: ClassTag, V: ClassTag, @@ -437,7 +423,6 @@ object KafkaUtils { } /** - * :: Experimental :: * Create an input stream that directly pulls messages from Kafka Brokers * without using any receiver. This stream can guarantee that each message * from Kafka is included in transformations exactly once (see points below). @@ -472,7 +457,6 @@ object KafkaUtils { * starting point of the stream * @param messageHandler Function for translating each message and metadata into the desired type */ - @Experimental def createDirectStream[K, V, KD <: Decoder[K], VD <: Decoder[V], R]( jssc: JavaStreamingContext, keyClass: Class[K], @@ -499,7 +483,6 @@ object KafkaUtils { } /** - * :: Experimental :: * Create an input stream that directly pulls messages from Kafka Brokers * without using any receiver. This stream can guarantee that each message * from Kafka is included in transformations exactly once (see points below). @@ -533,7 +516,6 @@ object KafkaUtils { * to determine where the stream starts (defaults to "largest") * @param topics Names of the topics to consume */ - @Experimental def createDirectStream[K, V, KD <: Decoder[K], VD <: Decoder[V]]( jssc: JavaStreamingContext, keyClass: Class[K], @@ -564,7 +546,7 @@ object KafkaUtils { * classOf[KafkaUtilsPythonHelper].newInstance(), and the createStream() * takes care of known parameters instead of passing them from Python */ -private class KafkaUtilsPythonHelper { +private[kafka] class KafkaUtilsPythonHelper { def createStream( jssc: JavaStreamingContext, kafkaParams: JMap[String, String], diff --git a/external/kafka/src/main/scala/org/apache/spark/streaming/kafka/OffsetRange.scala b/external/kafka/src/main/scala/org/apache/spark/streaming/kafka/OffsetRange.scala index 2f8981d4898bd..8a5f371494511 100644 --- a/external/kafka/src/main/scala/org/apache/spark/streaming/kafka/OffsetRange.scala +++ b/external/kafka/src/main/scala/org/apache/spark/streaming/kafka/OffsetRange.scala @@ -19,10 +19,7 @@ package org.apache.spark.streaming.kafka import kafka.common.TopicAndPartition -import org.apache.spark.annotation.Experimental - /** - * :: Experimental :: * Represents any object that has a collection of [[OffsetRange]]s. 
This can be used access the * offset ranges in RDDs generated by the direct Kafka DStream (see * [[KafkaUtils.createDirectStream()]]). @@ -33,13 +30,11 @@ import org.apache.spark.annotation.Experimental * } * }}} */ -@Experimental trait HasOffsetRanges { def offsetRanges: Array[OffsetRange] } /** - * :: Experimental :: * Represents a range of offsets from a single Kafka TopicAndPartition. Instances of this class * can be created with `OffsetRange.create()`. * @param topic Kafka topic name @@ -47,7 +42,6 @@ trait HasOffsetRanges { * @param fromOffset Inclusive starting offset * @param untilOffset Exclusive ending offset */ -@Experimental final class OffsetRange private( val topic: String, val partition: Int, @@ -84,10 +78,8 @@ final class OffsetRange private( } /** - * :: Experimental :: * Companion object the provides methods to create instances of [[OffsetRange]]. */ -@Experimental object OffsetRange { def create(topic: String, partition: Int, fromOffset: Long, untilOffset: Long): OffsetRange = new OffsetRange(topic, partition, fromOffset, untilOffset) diff --git a/external/mqtt/src/main/scala/org/apache/spark/streaming/mqtt/MQTTUtils.scala b/external/mqtt/src/main/scala/org/apache/spark/streaming/mqtt/MQTTUtils.scala index 38a1114863d15..7b8d56d6faf2d 100644 --- a/external/mqtt/src/main/scala/org/apache/spark/streaming/mqtt/MQTTUtils.scala +++ b/external/mqtt/src/main/scala/org/apache/spark/streaming/mqtt/MQTTUtils.scala @@ -21,8 +21,8 @@ import scala.reflect.ClassTag import org.apache.spark.storage.StorageLevel import org.apache.spark.streaming.StreamingContext -import org.apache.spark.streaming.api.java.{JavaReceiverInputDStream, JavaStreamingContext, JavaDStream} -import org.apache.spark.streaming.dstream.{ReceiverInputDStream, DStream} +import org.apache.spark.streaming.api.java.{JavaDStream, JavaReceiverInputDStream, JavaStreamingContext} +import org.apache.spark.streaming.dstream.ReceiverInputDStream object MQTTUtils { /** @@ -79,7 +79,7 @@ object MQTTUtils { * This is a helper class that wraps the methods in MQTTUtils into more Python-friendly class and * function so that it can be easily instantiated and called from Python's MQTTUtils. 
*/ -private class MQTTUtilsPythonHelper { +private[mqtt] class MQTTUtilsPythonHelper { def createStream( jssc: JavaStreamingContext, diff --git a/external/mqtt/src/test/scala/org/apache/spark/streaming/mqtt/MQTTTestUtils.scala b/external/mqtt/src/test/scala/org/apache/spark/streaming/mqtt/MQTTTestUtils.scala index 1a371b7008824..1618e2c088b70 100644 --- a/external/mqtt/src/test/scala/org/apache/spark/streaming/mqtt/MQTTTestUtils.scala +++ b/external/mqtt/src/test/scala/org/apache/spark/streaming/mqtt/MQTTTestUtils.scala @@ -33,7 +33,7 @@ import org.apache.spark.{Logging, SparkConf} /** * Share codes for Scala and Python unit tests */ -private class MQTTTestUtils extends Logging { +private[mqtt] class MQTTTestUtils extends Logging { private val persistenceDir = Utils.createTempDir() private val brokerHost = "localhost" diff --git a/extras/kinesis-asl/src/main/scala/org/apache/spark/streaming/kinesis/KinesisTestUtils.scala b/extras/kinesis-asl/src/main/scala/org/apache/spark/streaming/kinesis/KinesisTestUtils.scala index 711aade182945..c8eec13ec7dc7 100644 --- a/extras/kinesis-asl/src/main/scala/org/apache/spark/streaming/kinesis/KinesisTestUtils.scala +++ b/extras/kinesis-asl/src/main/scala/org/apache/spark/streaming/kinesis/KinesisTestUtils.scala @@ -36,7 +36,7 @@ import org.apache.spark.Logging /** * Shared utility methods for performing Kinesis tests that actually transfer data */ -private class KinesisTestUtils extends Logging { +private[kinesis] class KinesisTestUtils extends Logging { val endpointUrl = KinesisTestUtils.endpointUrl val regionName = RegionUtils.getRegionByEndpoint(endpointUrl).getName() diff --git a/streaming/src/main/java/org/apache/spark/streaming/util/WriteAheadLog.java b/streaming/src/main/java/org/apache/spark/streaming/util/WriteAheadLog.java index 8c0fdfa9c7478..3738fc1a235c2 100644 --- a/streaming/src/main/java/org/apache/spark/streaming/util/WriteAheadLog.java +++ b/streaming/src/main/java/org/apache/spark/streaming/util/WriteAheadLog.java @@ -21,6 +21,8 @@ import java.util.Iterator; /** + * :: DeveloperApi :: + * * This abstract class represents a write ahead log (aka journal) that is used by Spark Streaming * to save the received data (by receivers) and associated metadata to a reliable storage, so that * they can be recovered after driver failures. See the Spark documentation for more information diff --git a/streaming/src/main/java/org/apache/spark/streaming/util/WriteAheadLogRecordHandle.java b/streaming/src/main/java/org/apache/spark/streaming/util/WriteAheadLogRecordHandle.java index 02324189b7822..662889e779fb2 100644 --- a/streaming/src/main/java/org/apache/spark/streaming/util/WriteAheadLogRecordHandle.java +++ b/streaming/src/main/java/org/apache/spark/streaming/util/WriteAheadLogRecordHandle.java @@ -18,6 +18,8 @@ package org.apache.spark.streaming.util; /** + * :: DeveloperApi :: + * * This abstract class represents a handle that refers to a record written in a * {@link org.apache.spark.streaming.util.WriteAheadLog WriteAheadLog}. 
* It must contain all the information necessary for the record to be read and returned by diff --git a/streaming/src/main/scala/org/apache/spark/streaming/receiver/ReceivedBlockHandler.scala b/streaming/src/main/scala/org/apache/spark/streaming/receiver/ReceivedBlockHandler.scala index c8dd6e06812dc..5f6c5b024085c 100644 --- a/streaming/src/main/scala/org/apache/spark/streaming/receiver/ReceivedBlockHandler.scala +++ b/streaming/src/main/scala/org/apache/spark/streaming/receiver/ReceivedBlockHandler.scala @@ -222,7 +222,7 @@ private[streaming] object WriteAheadLogBasedBlockHandler { /** * A utility that will wrap the Iterator to get the count */ -private class CountingIterator[T](iterator: Iterator[T]) extends Iterator[T] { +private[streaming] class CountingIterator[T](iterator: Iterator[T]) extends Iterator[T] { private var _count = 0 private def isFullyConsumed: Boolean = !iterator.hasNext diff --git a/streaming/src/main/scala/org/apache/spark/streaming/scheduler/ReceiverTracker.scala b/streaming/src/main/scala/org/apache/spark/streaming/scheduler/ReceiverTracker.scala index aae3acf7aba3e..30d25a64e307a 100644 --- a/streaming/src/main/scala/org/apache/spark/streaming/scheduler/ReceiverTracker.scala +++ b/streaming/src/main/scala/org/apache/spark/streaming/scheduler/ReceiverTracker.scala @@ -546,7 +546,7 @@ class ReceiverTracker(ssc: StreamingContext, skipReceiverLaunch: Boolean = false * Function to start the receiver on the worker node. Use a class instead of closure to avoid * the serialization issue. */ -private class StartReceiverFunc( +private[streaming] class StartReceiverFunc( checkpointDirOption: Option[String], serializableHadoopConf: SerializableConfiguration) extends (Iterator[Receiver[_]] => Unit) with Serializable { diff --git a/streaming/src/main/scala/org/apache/spark/streaming/ui/BatchPage.scala b/streaming/src/main/scala/org/apache/spark/streaming/ui/BatchPage.scala index 0c891662c264f..90d1b0fadecfc 100644 --- a/streaming/src/main/scala/org/apache/spark/streaming/ui/BatchPage.scala +++ b/streaming/src/main/scala/org/apache/spark/streaming/ui/BatchPage.scala @@ -28,7 +28,7 @@ import org.apache.spark.ui.{UIUtils => SparkUIUtils, WebUIPage} import org.apache.spark.streaming.ui.StreamingJobProgressListener.{SparkJobId, OutputOpId} import org.apache.spark.ui.jobs.UIData.JobUIData -private case class SparkJobIdWithUIData(sparkJobId: SparkJobId, jobUIData: Option[JobUIData]) +private[ui] case class SparkJobIdWithUIData(sparkJobId: SparkJobId, jobUIData: Option[JobUIData]) private[ui] class BatchPage(parent: StreamingTab) extends WebUIPage("batch") { private val streamingListener = parent.listener From 9ce0c7ad333f4a3c01207e5e9ed42bcafb99d894 Mon Sep 17 00:00:00 2001 From: Burak Yavuz Date: Mon, 24 Aug 2015 13:48:01 -0700 Subject: [PATCH 047/802] [SPARK-7710] [SPARK-7998] [DOCS] Docs for DataFrameStatFunctions This PR contains examples on how to use some of the Stat Functions available for DataFrames under `df.stat`. rxin Author: Burak Yavuz Closes #8378 from brkyvz/update-sql-docs. 
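A compact, runnable version of the stat helpers being documented below; it assumes an existing `SQLContext` named `sqlContext`, and the seeds simply mirror the scaladoc snippets rather than anything required by the API.

```scala
import org.apache.spark.sql.functions.rand

val df = sqlContext.range(0, 10)
  .withColumn("rand1", rand(10L))
  .withColumn("rand2", rand(27L))

println(df.stat.corr("rand1", "rand2"))   // Pearson correlation coefficient (Double)
println(df.stat.cov("rand1", "rand2"))    // sample covariance (Double)

// Stratified sampling by key, keeping all of stratum 1 and none of the others.
val kv = sqlContext.createDataFrame(Seq((1, "a"), (1, "b"), (2, "c"), (3, "d"))).toDF("key", "value")
kv.stat.sampleBy("key", Map(1 -> 1.0), 36L).show()
```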
--- .../org/apache/spark/sql/DataFrame.scala | 2 +- .../spark/sql/DataFrameStatFunctions.scala | 101 ++++++++++++++++++ 2 files changed, 102 insertions(+), 1 deletion(-) diff --git a/sql/core/src/main/scala/org/apache/spark/sql/DataFrame.scala b/sql/core/src/main/scala/org/apache/spark/sql/DataFrame.scala index d6688b24ae7d6..791c10c3d7ce7 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/DataFrame.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/DataFrame.scala @@ -684,7 +684,7 @@ class DataFrame private[sql]( // make it a NamedExpression. case Column(u: UnresolvedAttribute) => UnresolvedAlias(u) case Column(expr: NamedExpression) => expr - // Leave an unaliased explode with an empty list of names since the analzyer will generate the + // Leave an unaliased explode with an empty list of names since the analyzer will generate the // correct defaults after the nested expression's type has been resolved. case Column(explode: Explode) => MultiAlias(explode, Nil) case Column(expr: Expression) => Alias(expr, expr.prettyString)() diff --git a/sql/core/src/main/scala/org/apache/spark/sql/DataFrameStatFunctions.scala b/sql/core/src/main/scala/org/apache/spark/sql/DataFrameStatFunctions.scala index 2e68e358f2f1f..69c984717526d 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/DataFrameStatFunctions.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/DataFrameStatFunctions.scala @@ -39,6 +39,13 @@ final class DataFrameStatFunctions private[sql](df: DataFrame) { * @param col2 the name of the second column * @return the covariance of the two columns. * + * {{{ + * val df = sc.parallelize(0 until 10).toDF("id").withColumn("rand1", rand(seed=10)) + * .withColumn("rand2", rand(seed=27)) + * df.stat.cov("rand1", "rand2") + * res1: Double = 0.065... + * }}} + * * @since 1.4.0 */ def cov(col1: String, col2: String): Double = { @@ -54,6 +61,13 @@ final class DataFrameStatFunctions private[sql](df: DataFrame) { * @param col2 the name of the column to calculate the correlation against * @return The Pearson Correlation Coefficient as a Double. * + * {{{ + * val df = sc.parallelize(0 until 10).toDF("id").withColumn("rand1", rand(seed=10)) + * .withColumn("rand2", rand(seed=27)) + * df.stat.corr("rand1", "rand2") + * res1: Double = 0.613... + * }}} + * * @since 1.4.0 */ def corr(col1: String, col2: String, method: String): Double = { @@ -69,6 +83,13 @@ final class DataFrameStatFunctions private[sql](df: DataFrame) { * @param col2 the name of the column to calculate the correlation against * @return The Pearson Correlation Coefficient as a Double. * + * {{{ + * val df = sc.parallelize(0 until 10).toDF("id").withColumn("rand1", rand(seed=10)) + * .withColumn("rand2", rand(seed=27)) + * df.stat.corr("rand1", "rand2", "pearson") + * res1: Double = 0.613... + * }}} + * * @since 1.4.0 */ def corr(col1: String, col2: String): Double = { @@ -92,6 +113,20 @@ final class DataFrameStatFunctions private[sql](df: DataFrame) { * of the DataFrame. * @return A DataFrame containing for the contingency table. 
* + * {{{ + * val df = sqlContext.createDataFrame(Seq((1, 1), (1, 2), (2, 1), (2, 1), (2, 3), (3, 2), + * (3, 3))).toDF("key", "value") + * val ct = df.stat.crosstab("key", "value") + * ct.show() + * +---------+---+---+---+ + * |key_value| 1| 2| 3| + * +---------+---+---+---+ + * | 2| 2| 0| 1| + * | 1| 1| 1| 0| + * | 3| 0| 1| 1| + * +---------+---+---+---+ + * }}} + * * @since 1.4.0 */ def crosstab(col1: String, col2: String): DataFrame = { @@ -112,6 +147,32 @@ final class DataFrameStatFunctions private[sql](df: DataFrame) { * than 1e-4. * @return A Local DataFrame with the Array of frequent items for each column. * + * {{{ + * val rows = Seq.tabulate(100) { i => + * if (i % 2 == 0) (1, -1.0) else (i, i * -1.0) + * } + * val df = sqlContext.createDataFrame(rows).toDF("a", "b") + * // find the items with a frequency greater than 0.4 (observed 40% of the time) for columns + * // "a" and "b" + * val freqSingles = df.stat.freqItems(Array("a", "b"), 0.4) + * freqSingles.show() + * +-----------+-------------+ + * |a_freqItems| b_freqItems| + * +-----------+-------------+ + * | [1, 99]|[-1.0, -99.0]| + * +-----------+-------------+ + * // find the pair of items with a frequency greater than 0.1 in columns "a" and "b" + * val pairDf = df.select(struct("a", "b").as("a-b")) + * val freqPairs = pairDf.stat.freqItems(Array("a-b"), 0.1) + * freqPairs.select(explode($"a-b_freqItems").as("freq_ab")).show() + * +----------+ + * | freq_ab| + * +----------+ + * | [1,-1.0]| + * | ... | + * +----------+ + * }}} + * * @since 1.4.0 */ def freqItems(cols: Array[String], support: Double): DataFrame = { @@ -147,6 +208,32 @@ final class DataFrameStatFunctions private[sql](df: DataFrame) { * @param cols the names of the columns to search frequent items in. * @return A Local DataFrame with the Array of frequent items for each column. * + * {{{ + * val rows = Seq.tabulate(100) { i => + * if (i % 2 == 0) (1, -1.0) else (i, i * -1.0) + * } + * val df = sqlContext.createDataFrame(rows).toDF("a", "b") + * // find the items with a frequency greater than 0.4 (observed 40% of the time) for columns + * // "a" and "b" + * val freqSingles = df.stat.freqItems(Seq("a", "b"), 0.4) + * freqSingles.show() + * +-----------+-------------+ + * |a_freqItems| b_freqItems| + * +-----------+-------------+ + * | [1, 99]|[-1.0, -99.0]| + * +-----------+-------------+ + * // find the pair of items with a frequency greater than 0.1 in columns "a" and "b" + * val pairDf = df.select(struct("a", "b").as("a-b")) + * val freqPairs = pairDf.stat.freqItems(Seq("a-b"), 0.1) + * freqPairs.select(explode($"a-b_freqItems").as("freq_ab")).show() + * +----------+ + * | freq_ab| + * +----------+ + * | [1,-1.0]| + * | ... 
| + * +----------+ + * }}} + * * @since 1.4.0 */ def freqItems(cols: Seq[String], support: Double): DataFrame = { @@ -180,6 +267,20 @@ final class DataFrameStatFunctions private[sql](df: DataFrame) { * @tparam T stratum type * @return a new [[DataFrame]] that represents the stratified sample * + * {{{ + * val df = sqlContext.createDataFrame(Seq((1, 1), (1, 2), (2, 1), (2, 1), (2, 3), (3, 2), + * (3, 3))).toDF("key", "value") + * val fractions = Map(1 -> 1.0, 3 -> 0.5) + * df.stat.sampleBy("key", fractions, 36L).show() + * +---+-----+ + * |key|value| + * +---+-----+ + * | 1| 1| + * | 1| 2| + * | 3| 2| + * +---+-----+ + * }}} + * * @since 1.5.0 */ def sampleBy[T](col: String, fractions: Map[T, Double], seed: Long): DataFrame = { From 662bb9667669cb07cf6d2ccee0d8e76bb561cd89 Mon Sep 17 00:00:00 2001 From: Andrew Or Date: Mon, 24 Aug 2015 14:10:50 -0700 Subject: [PATCH 048/802] [SPARK-10144] [UI] Actually show peak execution memory by default The peak execution memory metric was introduced in SPARK-8735. That was before Tungsten was enabled by default, so it assumed that `spark.sql.unsafe.enabled` must be explicitly set to true. The result is that the memory is not displayed by default. Author: Andrew Or Closes #8345 from andrewor14/show-memory-default. --- .../main/scala/org/apache/spark/ui/jobs/StagePage.scala | 6 ++---- .../test/scala/org/apache/spark/ui/StagePageSuite.scala | 8 ++++++-- 2 files changed, 8 insertions(+), 6 deletions(-) diff --git a/core/src/main/scala/org/apache/spark/ui/jobs/StagePage.scala b/core/src/main/scala/org/apache/spark/ui/jobs/StagePage.scala index fb4556b836859..4adc6596ba21c 100644 --- a/core/src/main/scala/org/apache/spark/ui/jobs/StagePage.scala +++ b/core/src/main/scala/org/apache/spark/ui/jobs/StagePage.scala @@ -68,8 +68,7 @@ private[ui] class StagePage(parent: StagesTab) extends WebUIPage("stage") { // if we find that it's okay. 
private val MAX_TIMELINE_TASKS = parent.conf.getInt("spark.ui.timeline.tasks.maximum", 1000) - private val displayPeakExecutionMemory = - parent.conf.getOption("spark.sql.unsafe.enabled").exists(_.toBoolean) + private val displayPeakExecutionMemory = parent.conf.getBoolean("spark.sql.unsafe.enabled", true) def render(request: HttpServletRequest): Seq[Node] = { progressListener.synchronized { @@ -1193,8 +1192,7 @@ private[ui] class TaskPagedTable( desc: Boolean) extends PagedTable[TaskTableRowData] { // We only track peak memory used for unsafe operators - private val displayPeakExecutionMemory = - conf.getOption("spark.sql.unsafe.enabled").exists(_.toBoolean) + private val displayPeakExecutionMemory = conf.getBoolean("spark.sql.unsafe.enabled", true) override def tableId: String = "task-table" diff --git a/core/src/test/scala/org/apache/spark/ui/StagePageSuite.scala b/core/src/test/scala/org/apache/spark/ui/StagePageSuite.scala index 98f9314f31dff..3388c6dca81f1 100644 --- a/core/src/test/scala/org/apache/spark/ui/StagePageSuite.scala +++ b/core/src/test/scala/org/apache/spark/ui/StagePageSuite.scala @@ -33,14 +33,18 @@ class StagePageSuite extends SparkFunSuite with LocalSparkContext { test("peak execution memory only displayed if unsafe is enabled") { val unsafeConf = "spark.sql.unsafe.enabled" - val conf = new SparkConf().set(unsafeConf, "true") + val conf = new SparkConf(false).set(unsafeConf, "true") val html = renderStagePage(conf).toString().toLowerCase val targetString = "peak execution memory" assert(html.contains(targetString)) // Disable unsafe and make sure it's not there - val conf2 = new SparkConf().set(unsafeConf, "false") + val conf2 = new SparkConf(false).set(unsafeConf, "false") val html2 = renderStagePage(conf2).toString().toLowerCase assert(!html2.contains(targetString)) + // Avoid setting anything; it should be displayed by default + val conf3 = new SparkConf(false) + val html3 = renderStagePage(conf3).toString().toLowerCase + assert(html3.contains(targetString)) } /** From a2f4cdceba32aaa0df59df335ca0ce1ac73fc6c2 Mon Sep 17 00:00:00 2001 From: Cheng Lian Date: Mon, 24 Aug 2015 14:11:19 -0700 Subject: [PATCH 049/802] [SPARK-8580] [SQL] Refactors ParquetHiveCompatibilitySuite and adds more test cases This PR refactors `ParquetHiveCompatibilitySuite` so that it's easier to add new test cases. Hit two bugs, SPARK-10177 and HIVE-11625, while working on this, added test cases for them and marked as ignored for now. SPARK-10177 will be addressed in a separate PR. Author: Cheng Lian Closes #8392 from liancheng/spark-8580/parquet-hive-compat-tests. 
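After the refactoring, adding a new compatibility case is essentially one call that pairs a `Row` of Scala values with the matching Hive column types. A rough sketch (illustrative only; the types below are already covered by the existing "simple primitives" test):

```scala
test("more primitives (sketch)") {
  // One row of Scala values plus the corresponding Hive DDL types; a matching
  // all-null row is appended automatically by the helper.
  testParquetHiveCompatibility(
    Row(42, "spark"),
    "INT", "STRING")
}
```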
--- .../hive/ParquetHiveCompatibilitySuite.scala | 132 ++++++++++++------ 1 file changed, 93 insertions(+), 39 deletions(-) diff --git a/sql/hive/src/test/scala/org/apache/spark/sql/hive/ParquetHiveCompatibilitySuite.scala b/sql/hive/src/test/scala/org/apache/spark/sql/hive/ParquetHiveCompatibilitySuite.scala index 13452e71a1b3b..bc30180cf0917 100644 --- a/sql/hive/src/test/scala/org/apache/spark/sql/hive/ParquetHiveCompatibilitySuite.scala +++ b/sql/hive/src/test/scala/org/apache/spark/sql/hive/ParquetHiveCompatibilitySuite.scala @@ -17,15 +17,17 @@ package org.apache.spark.sql.hive +import java.sql.Timestamp +import java.util.{Locale, TimeZone} + import org.apache.hadoop.hive.conf.HiveConf +import org.scalatest.BeforeAndAfterAll -import org.apache.spark.sql.hive.test.TestHive import org.apache.spark.sql.execution.datasources.parquet.ParquetCompatibilityTest +import org.apache.spark.sql.hive.test.TestHive import org.apache.spark.sql.{Row, SQLConf, SQLContext} -class ParquetHiveCompatibilitySuite extends ParquetCompatibilityTest { - import ParquetCompatibilityTest.makeNullable - +class ParquetHiveCompatibilitySuite extends ParquetCompatibilityTest with BeforeAndAfterAll { override def _sqlContext: SQLContext = TestHive private val sqlContext = _sqlContext @@ -35,69 +37,121 @@ class ParquetHiveCompatibilitySuite extends ParquetCompatibilityTest { */ private val stagingDir = new HiveConf().getVar(HiveConf.ConfVars.STAGINGDIR) - test("Read Parquet file generated by parquet-hive") { + private val originalTimeZone = TimeZone.getDefault + private val originalLocale = Locale.getDefault + + protected override def beforeAll(): Unit = { + TimeZone.setDefault(TimeZone.getTimeZone("America/Los_Angeles")) + Locale.setDefault(Locale.US) + } + + override protected def afterAll(): Unit = { + TimeZone.setDefault(originalTimeZone) + Locale.setDefault(originalLocale) + } + + override protected def logParquetSchema(path: String): Unit = { + val schema = readParquetSchema(path, { path => + !path.getName.startsWith("_") && !path.getName.startsWith(stagingDir) + }) + + logInfo( + s"""Schema of the Parquet file written by parquet-avro: + |$schema + """.stripMargin) + } + + private def testParquetHiveCompatibility(row: Row, hiveTypes: String*): Unit = { withTable("parquet_compat") { withTempPath { dir => val path = dir.getCanonicalPath + // Hive columns are always nullable, so here we append a all-null row. + val rows = row :: Row(Seq.fill(row.length)(null): _*) :: Nil + + // Don't convert Hive metastore Parquet tables to let Hive write those Parquet files. 
withSQLConf(HiveContext.CONVERT_METASTORE_PARQUET.key -> "false") { withTempTable("data") { - sqlContext.sql( + val fields = hiveTypes.zipWithIndex.map { case (typ, index) => s" col_$index $typ" } + + val ddl = s"""CREATE TABLE parquet_compat( - | bool_column BOOLEAN, - | byte_column TINYINT, - | short_column SMALLINT, - | int_column INT, - | long_column BIGINT, - | float_column FLOAT, - | double_column DOUBLE, - | - | strings_column ARRAY, - | int_to_string_column MAP + |${fields.mkString(",\n")} |) |STORED AS PARQUET |LOCATION '$path' + """.stripMargin + + logInfo( + s"""Creating testing Parquet table with the following DDL: + |$ddl """.stripMargin) + sqlContext.sql(ddl) + val schema = sqlContext.table("parquet_compat").schema - val rowRDD = sqlContext.sparkContext.parallelize(makeRows).coalesce(1) + val rowRDD = sqlContext.sparkContext.parallelize(rows).coalesce(1) sqlContext.createDataFrame(rowRDD, schema).registerTempTable("data") sqlContext.sql("INSERT INTO TABLE parquet_compat SELECT * FROM data") } } - val schema = readParquetSchema(path, { path => - !path.getName.startsWith("_") && !path.getName.startsWith(stagingDir) - }) - - logInfo( - s"""Schema of the Parquet file written by parquet-hive: - |$schema - """.stripMargin) + logParquetSchema(path) // Unfortunately parquet-hive doesn't add `UTF8` annotation to BINARY when writing strings. // Have to assume all BINARY values are strings here. withSQLConf(SQLConf.PARQUET_BINARY_AS_STRING.key -> "true") { - checkAnswer(sqlContext.read.parquet(path), makeRows) + checkAnswer(sqlContext.read.parquet(path), rows) } } } } - def makeRows: Seq[Row] = { - (0 until 10).map { i => - def nullable[T <: AnyRef]: ( => T) => T = makeNullable[T](i) + test("simple primitives") { + testParquetHiveCompatibility( + Row(true, 1.toByte, 2.toShort, 3, 4.toLong, 5.1f, 6.1d, "foo"), + "BOOLEAN", "TINYINT", "SMALLINT", "INT", "BIGINT", "FLOAT", "DOUBLE", "STRING") + } + ignore("SPARK-10177 timestamp") { + testParquetHiveCompatibility(Row(Timestamp.valueOf("2015-08-24 00:31:00")), "TIMESTAMP") + } + + test("array") { + testParquetHiveCompatibility( Row( - nullable(i % 2 == 0: java.lang.Boolean), - nullable(i.toByte: java.lang.Byte), - nullable((i + 1).toShort: java.lang.Short), - nullable(i + 2: Integer), - nullable(i.toLong * 10: java.lang.Long), - nullable(i.toFloat + 0.1f: java.lang.Float), - nullable(i.toDouble + 0.2d: java.lang.Double), - nullable(Seq.tabulate(3)(n => s"arr_${i + n}")), - nullable(Seq.tabulate(3)(n => (i + n: Integer) -> s"val_${i + n}").toMap)) - } + Seq[Integer](1: Integer, null, 2: Integer, null), + Seq[String]("foo", null, "bar", null), + Seq[Seq[Integer]]( + Seq[Integer](1: Integer, null), + Seq[Integer](2: Integer, null))), + "ARRAY", + "ARRAY", + "ARRAY>") + } + + test("map") { + testParquetHiveCompatibility( + Row( + Map[Integer, String]( + (1: Integer) -> "foo", + (2: Integer) -> null)), + "MAP") + } + + // HIVE-11625: Parquet map entries with null keys are dropped by Hive + ignore("map entries with null keys") { + testParquetHiveCompatibility( + Row( + Map[Integer, String]( + null.asInstanceOf[Integer] -> "bar", + null.asInstanceOf[Integer] -> null)), + "MAP") + } + + test("struct") { + testParquetHiveCompatibility( + Row(Row(1, Seq("foo", "bar", null))), + "STRUCT>") } } From cb2d2e15844d7ae34b5dd7028b55e11586ed93fa Mon Sep 17 00:00:00 2001 From: Sean Owen Date: Mon, 24 Aug 2015 22:35:21 +0100 Subject: [PATCH 050/802] [SPARK-9758] [TEST] [SQL] Compilation issue for hive test / wrong package? 
Move `test.org.apache.spark.sql.hive` package tests to apparent intended `org.apache.spark.sql.hive` as they don't intend to test behavior from outside org.apache.spark.* Alternate take, per discussion at https://github.com/apache/spark/pull/8051 I think this is what vanzin and I had in mind but also CC rxin to cross-check, as this does indeed depend on whether these tests were accidentally in this package or not. Testing from a `test.org.apache.spark` package is legitimate but didn't seem to be the intent here. Author: Sean Owen Closes #8307 from srowen/SPARK-9758. --- .../org/apache/spark/sql/hive/JavaDataFrameSuite.java | 6 ++---- .../spark/sql/hive/JavaMetastoreDataSourcesSuite.java | 3 +-- .../org/apache/spark/sql/hive/aggregate/MyDoubleAvg.java | 2 +- .../org/apache/spark/sql/hive/aggregate/MyDoubleSum.java | 2 +- .../apache/spark/sql/hive/execution/UDFIntegerToString.java | 0 .../org/apache/spark/sql/hive/execution/UDFListListInt.java | 0 .../org/apache/spark/sql/hive/execution/UDFListString.java | 0 .../apache/spark/sql/hive/execution/UDFStringString.java | 0 .../org/apache/spark/sql/hive/execution/UDFTwoListList.java | 0 .../spark/sql/hive/execution/AggregationQuerySuite.scala | 2 +- 10 files changed, 6 insertions(+), 9 deletions(-) rename sql/hive/src/test/java/{test => }/org/apache/spark/sql/hive/JavaDataFrameSuite.java (94%) rename sql/hive/src/test/java/{test => }/org/apache/spark/sql/hive/JavaMetastoreDataSourcesSuite.java (98%) rename sql/hive/src/test/java/{test => }/org/apache/spark/sql/hive/aggregate/MyDoubleAvg.java (98%) rename sql/hive/src/test/java/{test => }/org/apache/spark/sql/hive/aggregate/MyDoubleSum.java (98%) rename sql/hive/src/test/java/{test => }/org/apache/spark/sql/hive/execution/UDFIntegerToString.java (100%) rename sql/hive/src/test/java/{test => }/org/apache/spark/sql/hive/execution/UDFListListInt.java (100%) rename sql/hive/src/test/java/{test => }/org/apache/spark/sql/hive/execution/UDFListString.java (100%) rename sql/hive/src/test/java/{test => }/org/apache/spark/sql/hive/execution/UDFStringString.java (100%) rename sql/hive/src/test/java/{test => }/org/apache/spark/sql/hive/execution/UDFTwoListList.java (100%) diff --git a/sql/hive/src/test/java/test/org/apache/spark/sql/hive/JavaDataFrameSuite.java b/sql/hive/src/test/java/org/apache/spark/sql/hive/JavaDataFrameSuite.java similarity index 94% rename from sql/hive/src/test/java/test/org/apache/spark/sql/hive/JavaDataFrameSuite.java rename to sql/hive/src/test/java/org/apache/spark/sql/hive/JavaDataFrameSuite.java index a30dfa554eabc..019d8a30266e2 100644 --- a/sql/hive/src/test/java/test/org/apache/spark/sql/hive/JavaDataFrameSuite.java +++ b/sql/hive/src/test/java/org/apache/spark/sql/hive/JavaDataFrameSuite.java @@ -15,7 +15,7 @@ * limitations under the License. 
*/ -package test.org.apache.spark.sql.hive; +package org.apache.spark.sql.hive; import java.io.IOException; import java.util.ArrayList; @@ -31,10 +31,8 @@ import org.apache.spark.sql.expressions.Window; import org.apache.spark.sql.expressions.UserDefinedAggregateFunction; import static org.apache.spark.sql.functions.*; -import org.apache.spark.sql.hive.HiveContext; import org.apache.spark.sql.hive.test.TestHive$; -import org.apache.spark.sql.expressions.UserDefinedAggregateFunction; -import test.org.apache.spark.sql.hive.aggregate.MyDoubleSum; +import org.apache.spark.sql.hive.aggregate.MyDoubleSum; public class JavaDataFrameSuite { private transient JavaSparkContext sc; diff --git a/sql/hive/src/test/java/test/org/apache/spark/sql/hive/JavaMetastoreDataSourcesSuite.java b/sql/hive/src/test/java/org/apache/spark/sql/hive/JavaMetastoreDataSourcesSuite.java similarity index 98% rename from sql/hive/src/test/java/test/org/apache/spark/sql/hive/JavaMetastoreDataSourcesSuite.java rename to sql/hive/src/test/java/org/apache/spark/sql/hive/JavaMetastoreDataSourcesSuite.java index 15c2c3deb0d83..4192155975c47 100644 --- a/sql/hive/src/test/java/test/org/apache/spark/sql/hive/JavaMetastoreDataSourcesSuite.java +++ b/sql/hive/src/test/java/org/apache/spark/sql/hive/JavaMetastoreDataSourcesSuite.java @@ -15,7 +15,7 @@ * limitations under the License. */ -package test.org.apache.spark.sql.hive; +package org.apache.spark.sql.hive; import java.io.File; import java.io.IOException; @@ -37,7 +37,6 @@ import org.apache.spark.sql.DataFrame; import org.apache.spark.sql.QueryTest$; import org.apache.spark.sql.Row; -import org.apache.spark.sql.hive.HiveContext; import org.apache.spark.sql.hive.test.TestHive$; import org.apache.spark.sql.types.DataTypes; import org.apache.spark.sql.types.StructField; diff --git a/sql/hive/src/test/java/test/org/apache/spark/sql/hive/aggregate/MyDoubleAvg.java b/sql/hive/src/test/java/org/apache/spark/sql/hive/aggregate/MyDoubleAvg.java similarity index 98% rename from sql/hive/src/test/java/test/org/apache/spark/sql/hive/aggregate/MyDoubleAvg.java rename to sql/hive/src/test/java/org/apache/spark/sql/hive/aggregate/MyDoubleAvg.java index 2961b803f14aa..5a167edd89592 100644 --- a/sql/hive/src/test/java/test/org/apache/spark/sql/hive/aggregate/MyDoubleAvg.java +++ b/sql/hive/src/test/java/org/apache/spark/sql/hive/aggregate/MyDoubleAvg.java @@ -15,7 +15,7 @@ * limitations under the License. */ -package test.org.apache.spark.sql.hive.aggregate; +package org.apache.spark.sql.hive.aggregate; import java.util.ArrayList; import java.util.List; diff --git a/sql/hive/src/test/java/test/org/apache/spark/sql/hive/aggregate/MyDoubleSum.java b/sql/hive/src/test/java/org/apache/spark/sql/hive/aggregate/MyDoubleSum.java similarity index 98% rename from sql/hive/src/test/java/test/org/apache/spark/sql/hive/aggregate/MyDoubleSum.java rename to sql/hive/src/test/java/org/apache/spark/sql/hive/aggregate/MyDoubleSum.java index c71882a6e7bed..c3b7768e71bf8 100644 --- a/sql/hive/src/test/java/test/org/apache/spark/sql/hive/aggregate/MyDoubleSum.java +++ b/sql/hive/src/test/java/org/apache/spark/sql/hive/aggregate/MyDoubleSum.java @@ -15,7 +15,7 @@ * limitations under the License. 
*/ -package test.org.apache.spark.sql.hive.aggregate; +package org.apache.spark.sql.hive.aggregate; import java.util.ArrayList; import java.util.List; diff --git a/sql/hive/src/test/java/test/org/apache/spark/sql/hive/execution/UDFIntegerToString.java b/sql/hive/src/test/java/org/apache/spark/sql/hive/execution/UDFIntegerToString.java similarity index 100% rename from sql/hive/src/test/java/test/org/apache/spark/sql/hive/execution/UDFIntegerToString.java rename to sql/hive/src/test/java/org/apache/spark/sql/hive/execution/UDFIntegerToString.java diff --git a/sql/hive/src/test/java/test/org/apache/spark/sql/hive/execution/UDFListListInt.java b/sql/hive/src/test/java/org/apache/spark/sql/hive/execution/UDFListListInt.java similarity index 100% rename from sql/hive/src/test/java/test/org/apache/spark/sql/hive/execution/UDFListListInt.java rename to sql/hive/src/test/java/org/apache/spark/sql/hive/execution/UDFListListInt.java diff --git a/sql/hive/src/test/java/test/org/apache/spark/sql/hive/execution/UDFListString.java b/sql/hive/src/test/java/org/apache/spark/sql/hive/execution/UDFListString.java similarity index 100% rename from sql/hive/src/test/java/test/org/apache/spark/sql/hive/execution/UDFListString.java rename to sql/hive/src/test/java/org/apache/spark/sql/hive/execution/UDFListString.java diff --git a/sql/hive/src/test/java/test/org/apache/spark/sql/hive/execution/UDFStringString.java b/sql/hive/src/test/java/org/apache/spark/sql/hive/execution/UDFStringString.java similarity index 100% rename from sql/hive/src/test/java/test/org/apache/spark/sql/hive/execution/UDFStringString.java rename to sql/hive/src/test/java/org/apache/spark/sql/hive/execution/UDFStringString.java diff --git a/sql/hive/src/test/java/test/org/apache/spark/sql/hive/execution/UDFTwoListList.java b/sql/hive/src/test/java/org/apache/spark/sql/hive/execution/UDFTwoListList.java similarity index 100% rename from sql/hive/src/test/java/test/org/apache/spark/sql/hive/execution/UDFTwoListList.java rename to sql/hive/src/test/java/org/apache/spark/sql/hive/execution/UDFTwoListList.java diff --git a/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/AggregationQuerySuite.scala b/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/AggregationQuerySuite.scala index 119663af1887a..4886a85948367 100644 --- a/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/AggregationQuerySuite.scala +++ b/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/AggregationQuerySuite.scala @@ -24,7 +24,7 @@ import org.apache.spark.sql.execution.aggregate import org.apache.spark.sql.hive.test.TestHive import org.apache.spark.sql.test.SQLTestUtils import org.apache.spark.sql.types.{IntegerType, StringType, StructField, StructType} -import _root_.test.org.apache.spark.sql.hive.aggregate.{MyDoubleAvg, MyDoubleSum} +import org.apache.spark.sql.hive.aggregate.{MyDoubleAvg, MyDoubleSum} abstract class AggregationQuerySuite extends QueryTest with SQLTestUtils with BeforeAndAfterAll { override def _sqlContext: SQLContext = TestHive From 13db11cb08eb90eb0ea3402c9fe0270aa282f247 Mon Sep 17 00:00:00 2001 From: "Joseph K. Bradley" Date: Mon, 24 Aug 2015 15:38:54 -0700 Subject: [PATCH 051/802] [SPARK-10061] [DOC] ML ensemble docs User guide for spark.ml GBTs and Random Forests. The examples are copied from the decision tree guide and modified to run. I caught some issues I had somehow missed in the tree guide as well. I have run all examples, including Java ones. (Of course, I thought I had previously as well...) 
CC: mengxr manishamde yanboliang Author: Joseph K. Bradley Closes #8369 from jkbradley/ml-ensemble-docs. --- docs/ml-decision-tree.md | 75 ++- docs/ml-ensembles.md | 952 ++++++++++++++++++++++++++++++++++++++- 2 files changed, 976 insertions(+), 51 deletions(-) diff --git a/docs/ml-decision-tree.md b/docs/ml-decision-tree.md index 958c6f5e4716c..542819e93e6dc 100644 --- a/docs/ml-decision-tree.md +++ b/docs/ml-decision-tree.md @@ -30,7 +30,7 @@ The Pipelines API for Decision Trees offers a bit more functionality than the or Ensembles of trees (Random Forests and Gradient-Boosted Trees) are described in the [Ensembles guide](ml-ensembles.html). -# Inputs and Outputs (Predictions) +# Inputs and Outputs We list the input and output (prediction) column types here. All output columns are optional; to exclude an output column, set its corresponding Param to an empty string. @@ -234,7 +234,7 @@ IndexToString labelConverter = new IndexToString() // Chain indexers and tree in a Pipeline Pipeline pipeline = new Pipeline() - .setStages(new PipelineStage[]{labelIndexer, featureIndexer, dt, labelConverter}); + .setStages(new PipelineStage[] {labelIndexer, featureIndexer, dt, labelConverter}); // Train model. This also runs the indexers. PipelineModel model = pipeline.fit(trainingData); @@ -315,10 +315,13 @@ print treeModel # summary only ## Regression +The following examples load a dataset in LibSVM format, split it into training and test sets, train on the first dataset, and then evaluate on the held-out test set. +We use a feature transformer to index categorical features, adding metadata to the `DataFrame` which the Decision Tree algorithm can recognize. +
-More details on parameters can be found in the [Scala API documentation](api/scala/index.html#org.apache.spark.ml.classification.DecisionTreeClassifier). +More details on parameters can be found in the [Scala API documentation](api/scala/index.html#org.apache.spark.ml.regression.DecisionTreeRegressor). {% highlight scala %} import org.apache.spark.ml.Pipeline @@ -347,7 +350,7 @@ val dt = new DecisionTreeRegressor() .setLabelCol("label") .setFeaturesCol("indexedFeatures") -// Chain indexers and tree in a Pipeline +// Chain indexer and tree in a Pipeline val pipeline = new Pipeline() .setStages(Array(featureIndexer, dt)) @@ -365,9 +368,7 @@ val evaluator = new RegressionEvaluator() .setLabelCol("label") .setPredictionCol("prediction") .setMetricName("rmse") -// We negate the RMSE value since RegressionEvalutor returns negated RMSE -// (since evaluation metrics are meant to be maximized by CrossValidator). -val rmse = - evaluator.evaluate(predictions) +val rmse = evaluator.evaluate(predictions) println("Root Mean Squared Error (RMSE) on test data = " + rmse) val treeModel = model.stages(1).asInstanceOf[DecisionTreeRegressionModel] @@ -377,14 +378,15 @@ println("Learned regression tree model:\n" + treeModel.toDebugString)
-More details on parameters can be found in the [Java API documentation](api/java/org/apache/spark/ml/classification/DecisionTreeClassifier.html). +More details on parameters can be found in the [Java API documentation](api/java/org/apache/spark/ml/regression/DecisionTreeRegressor.html). {% highlight java %} import org.apache.spark.ml.Pipeline; import org.apache.spark.ml.PipelineModel; import org.apache.spark.ml.PipelineStage; import org.apache.spark.ml.evaluation.RegressionEvaluator; -import org.apache.spark.ml.feature.*; +import org.apache.spark.ml.feature.VectorIndexer; +import org.apache.spark.ml.feature.VectorIndexerModel; import org.apache.spark.ml.regression.DecisionTreeRegressionModel; import org.apache.spark.ml.regression.DecisionTreeRegressor; import org.apache.spark.mllib.regression.LabeledPoint; @@ -396,17 +398,12 @@ import org.apache.spark.sql.DataFrame; RDD rdd = MLUtils.loadLibSVMFile(sc.sc(), "data/mllib/sample_libsvm_data.txt"); DataFrame data = jsql.createDataFrame(rdd, LabeledPoint.class); -// Index labels, adding metadata to the label column. -// Fit on whole dataset to include all labels in index. -StringIndexerModel labelIndexer = new StringIndexer() - .setInputCol("label") - .setOutputCol("indexedLabel") - .fit(data); // Automatically identify categorical features, and index them. +// Set maxCategories so features with > 4 distinct values are treated as continuous. VectorIndexerModel featureIndexer = new VectorIndexer() .setInputCol("features") .setOutputCol("indexedFeatures") - .setMaxCategories(4) // features with > 4 distinct values are treated as continuous + .setMaxCategories(4) .fit(data); // Split the data into training and test sets (30% held out for testing) @@ -416,61 +413,49 @@ DataFrame testData = splits[1]; // Train a DecisionTree model. DecisionTreeRegressor dt = new DecisionTreeRegressor() - .setLabelCol("indexedLabel") .setFeaturesCol("indexedFeatures"); -// Convert indexed labels back to original labels. -IndexToString labelConverter = new IndexToString() - .setInputCol("prediction") - .setOutputCol("predictedLabel") - .setLabels(labelIndexer.labels()); - -// Chain indexers and tree in a Pipeline +// Chain indexer and tree in a Pipeline Pipeline pipeline = new Pipeline() - .setStages(new PipelineStage[]{labelIndexer, featureIndexer, dt, labelConverter}); + .setStages(new PipelineStage[] {featureIndexer, dt}); -// Train model. This also runs the indexers. +// Train model. This also runs the indexer. PipelineModel model = pipeline.fit(trainingData); // Make predictions. DataFrame predictions = model.transform(testData); // Select example rows to display. -predictions.select("predictedLabel", "label", "features").show(5); +predictions.select("label", "features").show(5); // Select (prediction, true label) and compute test error RegressionEvaluator evaluator = new RegressionEvaluator() - .setLabelCol("indexedLabel") + .setLabelCol("label") .setPredictionCol("prediction") .setMetricName("rmse"); -// We negate the RMSE value since RegressionEvalutor returns negated RMSE -// (since evaluation metrics are meant to be maximized by CrossValidator). 
-double rmse = - evaluator.evaluate(predictions); +double rmse = evaluator.evaluate(predictions); System.out.println("Root Mean Squared Error (RMSE) on test data = " + rmse); DecisionTreeRegressionModel treeModel = - (DecisionTreeRegressionModel)(model.stages()[2]); + (DecisionTreeRegressionModel)(model.stages()[1]); System.out.println("Learned regression tree model:\n" + treeModel.toDebugString()); {% endhighlight %}
-More details on parameters can be found in the [Python API documentation](api/python/pyspark.ml.html#pyspark.ml.classification.DecisionTreeClassifier). +More details on parameters can be found in the [Python API documentation](api/python/pyspark.ml.html#pyspark.ml.regression.DecisionTreeRegressor). {% highlight python %} from pyspark.ml import Pipeline from pyspark.ml.regression import DecisionTreeRegressor -from pyspark.ml.feature import StringIndexer, VectorIndexer +from pyspark.ml.feature import VectorIndexer from pyspark.ml.evaluation import RegressionEvaluator from pyspark.mllib.util import MLUtils # Load and parse the data file, converting it to a DataFrame. data = MLUtils.loadLibSVMFile(sc, "data/mllib/sample_libsvm_data.txt").toDF() -# Index labels, adding metadata to the label column. -# Fit on whole dataset to include all labels in index. -labelIndexer = StringIndexer(inputCol="label", outputCol="indexedLabel").fit(data) # Automatically identify categorical features, and index them. # We specify maxCategories so features with > 4 distinct values are treated as continuous. featureIndexer =\ @@ -480,26 +465,24 @@ featureIndexer =\ (trainingData, testData) = data.randomSplit([0.7, 0.3]) # Train a DecisionTree model. -dt = DecisionTreeRegressor(labelCol="indexedLabel", featuresCol="indexedFeatures") +dt = DecisionTreeRegressor(featuresCol="indexedFeatures") -# Chain indexers and tree in a Pipeline -pipeline = Pipeline(stages=[labelIndexer, featureIndexer, dt]) +# Chain indexer and tree in a Pipeline +pipeline = Pipeline(stages=[featureIndexer, dt]) -# Train model. This also runs the indexers. +# Train model. This also runs the indexer. model = pipeline.fit(trainingData) # Make predictions. predictions = model.transform(testData) # Select example rows to display. -predictions.select("prediction", "indexedLabel", "features").show(5) +predictions.select("prediction", "label", "features").show(5) # Select (prediction, true label) and compute test error evaluator = RegressionEvaluator( - labelCol="indexedLabel", predictionCol="prediction", metricName="rmse") -# We negate the RMSE value since RegressionEvalutor returns negated RMSE -# (since evaluation metrics are meant to be maximized by CrossValidator). -rmse = -evaluator.evaluate(predictions) + labelCol="label", predictionCol="prediction", metricName="rmse") +rmse = evaluator.evaluate(predictions) print "Root Mean Squared Error (RMSE) on test data = %g" % rmse treeModel = model.stages[1] diff --git a/docs/ml-ensembles.md b/docs/ml-ensembles.md index 9ff50e95fc479..62749909e01dc 100644 --- a/docs/ml-ensembles.md +++ b/docs/ml-ensembles.md @@ -11,11 +11,947 @@ displayTitle: ML - Ensembles An [ensemble method](http://en.wikipedia.org/wiki/Ensemble_learning) is a learning algorithm which creates a model composed of a set of other base models. -The Pipelines API supports the following ensemble algorithms: [`OneVsRest`](api/scala/index.html#org.apache.spark.ml.classifier.OneVsRest) -## OneVsRest +## Tree Ensembles -[OneVsRest](http://en.wikipedia.org/wiki/Multiclass_classification#One-vs.-rest) is an example of a machine learning reduction for performing multiclass classification given a base classifier that can perform binary classification efficiently. +The Pipelines API supports two major tree ensemble algorithms: [Random Forests](http://en.wikipedia.org/wiki/Random_forest) and [Gradient-Boosted Trees (GBTs)](http://en.wikipedia.org/wiki/Gradient_boosting). +Both use [MLlib decision trees](ml-decision-tree.html) as their base models. 
+ +Users can find more information about ensemble algorithms in the [MLlib Ensemble guide](mllib-ensembles.html). In this section, we demonstrate the Pipelines API for ensembles. + +The main differences between this API and the [original MLlib ensembles API](mllib-ensembles.html) are: +* support for ML Pipelines +* separation of classification vs. regression +* use of DataFrame metadata to distinguish continuous and categorical features +* a bit more functionality for random forests: estimates of feature importance, as well as the predicted probability of each class (a.k.a. class conditional probabilities) for classification. + +### Random Forests + +[Random forests](http://en.wikipedia.org/wiki/Random_forest) +are ensembles of [decision trees](ml-decision-tree.html). +Random forests combine many decision trees in order to reduce the risk of overfitting. +MLlib supports random forests for binary and multiclass classification and for regression, +using both continuous and categorical features. + +This section gives examples of using random forests with the Pipelines API. +For more information on the algorithm, please see the [main MLlib docs on random forests](mllib-ensembles.html). + +#### Inputs and Outputs + +We list the input and output (prediction) column types here. +All output columns are optional; to exclude an output column, set its corresponding Param to an empty string. + +##### Input Columns + +
+<table class="table">
+  <thead>
+    <tr>
+      <th>Param name</th>
+      <th>Type(s)</th>
+      <th>Default</th>
+      <th>Description</th>
+    </tr>
+  </thead>
+  <tbody>
+    <tr>
+      <td>labelCol</td>
+      <td>Double</td>
+      <td>"label"</td>
+      <td>Label to predict</td>
+    </tr>
+    <tr>
+      <td>featuresCol</td>
+      <td>Vector</td>
+      <td>"features"</td>
+      <td>Feature vector</td>
+    </tr>
+  </tbody>
+</table>
+
+##### Output Columns (Predictions)
+
+<table class="table">
+  <thead>
+    <tr>
+      <th>Param name</th>
+      <th>Type(s)</th>
+      <th>Default</th>
+      <th>Description</th>
+      <th>Notes</th>
+    </tr>
+  </thead>
+  <tbody>
+    <tr>
+      <td>predictionCol</td>
+      <td>Double</td>
+      <td>"prediction"</td>
+      <td>Predicted label</td>
+      <td></td>
+    </tr>
+    <tr>
+      <td>rawPredictionCol</td>
+      <td>Vector</td>
+      <td>"rawPrediction"</td>
+      <td>Vector of length # classes, with the counts of training instance labels at the tree node which makes the prediction</td>
+      <td>Classification only</td>
+    </tr>
+    <tr>
+      <td>probabilityCol</td>
+      <td>Vector</td>
+      <td>"probability"</td>
+      <td>Vector of length # classes equal to rawPrediction normalized to a multinomial distribution</td>
+      <td>Classification only</td>
+    </tr>
+  </tbody>
+</table>
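+
+For example, with a fitted `RandomForestClassificationModel` (or a `PipelineModel` that contains one), these columns appear in the transformed `DataFrame` roughly as follows (illustrative sketch; the column names assume the defaults above, and `model`/`testData` come from the classification example below):
+
+{% highlight scala %}
+// Transform the held-out data and inspect the prediction-related columns.
+val predictions = model.transform(testData)
+predictions.select("prediction", "rawPrediction", "probability").show(5)
+{% endhighlight %}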
+ +#### Example: Classification + +The following examples load a dataset in LibSVM format, split it into training and test sets, train on the first dataset, and then evaluate on the held-out test set. +We use two feature transformers to prepare the data; these help index categories for the label and categorical features, adding metadata to the `DataFrame` which the tree-based algorithms can recognize. + +
+
+ +Refer to the [Scala API docs](api/scala/index.html#org.apache.spark.ml.classification.RandomForestClassifier) for more details. + +{% highlight scala %} +import org.apache.spark.ml.Pipeline +import org.apache.spark.ml.classification.RandomForestClassifier +import org.apache.spark.ml.classification.RandomForestClassificationModel +import org.apache.spark.ml.feature.{StringIndexer, IndexToString, VectorIndexer} +import org.apache.spark.ml.evaluation.MulticlassClassificationEvaluator +import org.apache.spark.mllib.util.MLUtils + +// Load and parse the data file, converting it to a DataFrame. +val data = MLUtils.loadLibSVMFile(sc, "data/mllib/sample_libsvm_data.txt").toDF() + +// Index labels, adding metadata to the label column. +// Fit on whole dataset to include all labels in index. +val labelIndexer = new StringIndexer() + .setInputCol("label") + .setOutputCol("indexedLabel") + .fit(data) +// Automatically identify categorical features, and index them. +// Set maxCategories so features with > 4 distinct values are treated as continuous. +val featureIndexer = new VectorIndexer() + .setInputCol("features") + .setOutputCol("indexedFeatures") + .setMaxCategories(4) + .fit(data) + +// Split the data into training and test sets (30% held out for testing) +val Array(trainingData, testData) = data.randomSplit(Array(0.7, 0.3)) + +// Train a RandomForest model. +val rf = new RandomForestClassifier() + .setLabelCol("indexedLabel") + .setFeaturesCol("indexedFeatures") + .setNumTrees(10) + +// Convert indexed labels back to original labels. +val labelConverter = new IndexToString() + .setInputCol("prediction") + .setOutputCol("predictedLabel") + .setLabels(labelIndexer.labels) + +// Chain indexers and forest in a Pipeline +val pipeline = new Pipeline() + .setStages(Array(labelIndexer, featureIndexer, rf, labelConverter)) + +// Train model. This also runs the indexers. +val model = pipeline.fit(trainingData) + +// Make predictions. +val predictions = model.transform(testData) + +// Select example rows to display. +predictions.select("predictedLabel", "label", "features").show(5) + +// Select (prediction, true label) and compute test error +val evaluator = new MulticlassClassificationEvaluator() + .setLabelCol("indexedLabel") + .setPredictionCol("prediction") + .setMetricName("precision") +val accuracy = evaluator.evaluate(predictions) +println("Test Error = " + (1.0 - accuracy)) + +val rfModel = model.stages(2).asInstanceOf[RandomForestClassificationModel] +println("Learned classification forest model:\n" + rfModel.toDebugString) +{% endhighlight %} +
+ +
+ +Refer to the [Java API docs](api/java/org/apache/spark/ml/classification/RandomForestClassifier.html) for more details. + +{% highlight java %} +import org.apache.spark.ml.Pipeline; +import org.apache.spark.ml.PipelineModel; +import org.apache.spark.ml.PipelineStage; +import org.apache.spark.ml.classification.RandomForestClassifier; +import org.apache.spark.ml.classification.RandomForestClassificationModel; +import org.apache.spark.ml.evaluation.MulticlassClassificationEvaluator; +import org.apache.spark.ml.feature.*; +import org.apache.spark.mllib.regression.LabeledPoint; +import org.apache.spark.mllib.util.MLUtils; +import org.apache.spark.rdd.RDD; +import org.apache.spark.sql.DataFrame; + +// Load and parse the data file, converting it to a DataFrame. +RDD rdd = MLUtils.loadLibSVMFile(sc.sc(), "data/mllib/sample_libsvm_data.txt"); +DataFrame data = jsql.createDataFrame(rdd, LabeledPoint.class); + +// Index labels, adding metadata to the label column. +// Fit on whole dataset to include all labels in index. +StringIndexerModel labelIndexer = new StringIndexer() + .setInputCol("label") + .setOutputCol("indexedLabel") + .fit(data); +// Automatically identify categorical features, and index them. +// Set maxCategories so features with > 4 distinct values are treated as continuous. +VectorIndexerModel featureIndexer = new VectorIndexer() + .setInputCol("features") + .setOutputCol("indexedFeatures") + .setMaxCategories(4) + .fit(data); + +// Split the data into training and test sets (30% held out for testing) +DataFrame[] splits = data.randomSplit(new double[] {0.7, 0.3}); +DataFrame trainingData = splits[0]; +DataFrame testData = splits[1]; + +// Train a RandomForest model. +RandomForestClassifier rf = new RandomForestClassifier() + .setLabelCol("indexedLabel") + .setFeaturesCol("indexedFeatures"); + +// Convert indexed labels back to original labels. +IndexToString labelConverter = new IndexToString() + .setInputCol("prediction") + .setOutputCol("predictedLabel") + .setLabels(labelIndexer.labels()); + +// Chain indexers and forest in a Pipeline +Pipeline pipeline = new Pipeline() + .setStages(new PipelineStage[] {labelIndexer, featureIndexer, rf, labelConverter}); + +// Train model. This also runs the indexers. +PipelineModel model = pipeline.fit(trainingData); + +// Make predictions. +DataFrame predictions = model.transform(testData); + +// Select example rows to display. +predictions.select("predictedLabel", "label", "features").show(5); + +// Select (prediction, true label) and compute test error +MulticlassClassificationEvaluator evaluator = new MulticlassClassificationEvaluator() + .setLabelCol("indexedLabel") + .setPredictionCol("prediction") + .setMetricName("precision"); +double accuracy = evaluator.evaluate(predictions); +System.out.println("Test Error = " + (1.0 - accuracy)); + +RandomForestClassificationModel rfModel = + (RandomForestClassificationModel)(model.stages()[2]); +System.out.println("Learned classification forest model:\n" + rfModel.toDebugString()); +{% endhighlight %} +
+ +
+ +Refer to the [Python API docs](api/python/pyspark.ml.html#pyspark.ml.classification.RandomForestClassifier) for more details. + +{% highlight python %} +from pyspark.ml import Pipeline +from pyspark.ml.classification import RandomForestClassifier +from pyspark.ml.feature import StringIndexer, VectorIndexer +from pyspark.ml.evaluation import MulticlassClassificationEvaluator +from pyspark.mllib.util import MLUtils + +# Load and parse the data file, converting it to a DataFrame. +data = MLUtils.loadLibSVMFile(sc, "data/mllib/sample_libsvm_data.txt").toDF() + +# Index labels, adding metadata to the label column. +# Fit on whole dataset to include all labels in index. +labelIndexer = StringIndexer(inputCol="label", outputCol="indexedLabel").fit(data) +# Automatically identify categorical features, and index them. +# Set maxCategories so features with > 4 distinct values are treated as continuous. +featureIndexer =\ + VectorIndexer(inputCol="features", outputCol="indexedFeatures", maxCategories=4).fit(data) + +# Split the data into training and test sets (30% held out for testing) +(trainingData, testData) = data.randomSplit([0.7, 0.3]) + +# Train a RandomForest model. +rf = RandomForestClassifier(labelCol="indexedLabel", featuresCol="indexedFeatures") + +# Chain indexers and forest in a Pipeline +pipeline = Pipeline(stages=[labelIndexer, featureIndexer, rf]) + +# Train model. This also runs the indexers. +model = pipeline.fit(trainingData) + +# Make predictions. +predictions = model.transform(testData) + +# Select example rows to display. +predictions.select("prediction", "indexedLabel", "features").show(5) + +# Select (prediction, true label) and compute test error +evaluator = MulticlassClassificationEvaluator( + labelCol="indexedLabel", predictionCol="prediction", metricName="precision") +accuracy = evaluator.evaluate(predictions) +print "Test Error = %g" % (1.0 - accuracy) + +rfModel = model.stages[2] +print rfModel # summary only +{% endhighlight %} +
+
+ +#### Example: Regression + +The following examples load a dataset in LibSVM format, split it into training and test sets, train on the first dataset, and then evaluate on the held-out test set. +We use a feature transformer to index categorical features, adding metadata to the `DataFrame` which the tree-based algorithms can recognize. + +
+
+ +Refer to the [Scala API docs](api/scala/index.html#org.apache.spark.ml.regression.RandomForestRegressor) for more details. + +{% highlight scala %} +import org.apache.spark.ml.Pipeline +import org.apache.spark.ml.regression.RandomForestRegressor +import org.apache.spark.ml.regression.RandomForestRegressionModel +import org.apache.spark.ml.feature.VectorIndexer +import org.apache.spark.ml.evaluation.RegressionEvaluator +import org.apache.spark.mllib.util.MLUtils + +// Load and parse the data file, converting it to a DataFrame. +val data = MLUtils.loadLibSVMFile(sc, "data/mllib/sample_libsvm_data.txt").toDF() + +// Automatically identify categorical features, and index them. +// Set maxCategories so features with > 4 distinct values are treated as continuous. +val featureIndexer = new VectorIndexer() + .setInputCol("features") + .setOutputCol("indexedFeatures") + .setMaxCategories(4) + .fit(data) + +// Split the data into training and test sets (30% held out for testing) +val Array(trainingData, testData) = data.randomSplit(Array(0.7, 0.3)) + +// Train a RandomForest model. +val rf = new RandomForestRegressor() + .setLabelCol("label") + .setFeaturesCol("indexedFeatures") + +// Chain indexer and forest in a Pipeline +val pipeline = new Pipeline() + .setStages(Array(featureIndexer, rf)) + +// Train model. This also runs the indexer. +val model = pipeline.fit(trainingData) + +// Make predictions. +val predictions = model.transform(testData) + +// Select example rows to display. +predictions.select("prediction", "label", "features").show(5) + +// Select (prediction, true label) and compute test error +val evaluator = new RegressionEvaluator() + .setLabelCol("label") + .setPredictionCol("prediction") + .setMetricName("rmse") +val rmse = evaluator.evaluate(predictions) +println("Root Mean Squared Error (RMSE) on test data = " + rmse) + +val rfModel = model.stages(1).asInstanceOf[RandomForestRegressionModel] +println("Learned regression forest model:\n" + rfModel.toDebugString) +{% endhighlight %} +
+ +
+ +Refer to the [Java API docs](api/java/org/apache/spark/ml/regression/RandomForestRegressor.html) for more details. + +{% highlight java %} +import org.apache.spark.ml.Pipeline; +import org.apache.spark.ml.PipelineModel; +import org.apache.spark.ml.PipelineStage; +import org.apache.spark.ml.evaluation.RegressionEvaluator; +import org.apache.spark.ml.feature.VectorIndexer; +import org.apache.spark.ml.feature.VectorIndexerModel; +import org.apache.spark.ml.regression.RandomForestRegressionModel; +import org.apache.spark.ml.regression.RandomForestRegressor; +import org.apache.spark.mllib.regression.LabeledPoint; +import org.apache.spark.mllib.util.MLUtils; +import org.apache.spark.rdd.RDD; +import org.apache.spark.sql.DataFrame; + +// Load and parse the data file, converting it to a DataFrame. +RDD rdd = MLUtils.loadLibSVMFile(sc.sc(), "data/mllib/sample_libsvm_data.txt"); +DataFrame data = jsql.createDataFrame(rdd, LabeledPoint.class); + +// Automatically identify categorical features, and index them. +// Set maxCategories so features with > 4 distinct values are treated as continuous. +VectorIndexerModel featureIndexer = new VectorIndexer() + .setInputCol("features") + .setOutputCol("indexedFeatures") + .setMaxCategories(4) + .fit(data); + +// Split the data into training and test sets (30% held out for testing) +DataFrame[] splits = data.randomSplit(new double[] {0.7, 0.3}); +DataFrame trainingData = splits[0]; +DataFrame testData = splits[1]; + +// Train a RandomForest model. +RandomForestRegressor rf = new RandomForestRegressor() + .setLabelCol("label") + .setFeaturesCol("indexedFeatures"); + +// Chain indexer and forest in a Pipeline +Pipeline pipeline = new Pipeline() + .setStages(new PipelineStage[] {featureIndexer, rf}); + +// Train model. This also runs the indexer. +PipelineModel model = pipeline.fit(trainingData); + +// Make predictions. +DataFrame predictions = model.transform(testData); + +// Select example rows to display. +predictions.select("prediction", "label", "features").show(5); + +// Select (prediction, true label) and compute test error +RegressionEvaluator evaluator = new RegressionEvaluator() + .setLabelCol("label") + .setPredictionCol("prediction") + .setMetricName("rmse"); +double rmse = evaluator.evaluate(predictions); +System.out.println("Root Mean Squared Error (RMSE) on test data = " + rmse); + +RandomForestRegressionModel rfModel = + (RandomForestRegressionModel)(model.stages()[1]); +System.out.println("Learned regression forest model:\n" + rfModel.toDebugString()); +{% endhighlight %} +
+ +
+ +Refer to the [Python API docs](api/python/pyspark.ml.html#pyspark.ml.regression.RandomForestRegressor) for more details. + +{% highlight python %} +from pyspark.ml import Pipeline +from pyspark.ml.regression import RandomForestRegressor +from pyspark.ml.feature import VectorIndexer +from pyspark.ml.evaluation import RegressionEvaluator +from pyspark.mllib.util import MLUtils + +# Load and parse the data file, converting it to a DataFrame. +data = MLUtils.loadLibSVMFile(sc, "data/mllib/sample_libsvm_data.txt").toDF() + +# Automatically identify categorical features, and index them. +# Set maxCategories so features with > 4 distinct values are treated as continuous. +featureIndexer =\ + VectorIndexer(inputCol="features", outputCol="indexedFeatures", maxCategories=4).fit(data) + +# Split the data into training and test sets (30% held out for testing) +(trainingData, testData) = data.randomSplit([0.7, 0.3]) + +# Train a RandomForest model. +rf = RandomForestRegressor(featuresCol="indexedFeatures") + +# Chain indexer and forest in a Pipeline +pipeline = Pipeline(stages=[featureIndexer, rf]) + +# Train model. This also runs the indexer. +model = pipeline.fit(trainingData) + +# Make predictions. +predictions = model.transform(testData) + +# Select example rows to display. +predictions.select("prediction", "label", "features").show(5) + +# Select (prediction, true label) and compute test error +evaluator = RegressionEvaluator( + labelCol="label", predictionCol="prediction", metricName="rmse") +rmse = evaluator.evaluate(predictions) +print "Root Mean Squared Error (RMSE) on test data = %g" % rmse + +rfModel = model.stages[1] +print rfModel # summary only +{% endhighlight %} +
+
+ +### Gradient-Boosted Trees (GBTs) + +[Gradient-Boosted Trees (GBTs)](http://en.wikipedia.org/wiki/Gradient_boosting) +are ensembles of [decision trees](ml-decision-tree.html). +GBTs iteratively train decision trees in order to minimize a loss function. +MLlib supports GBTs for binary classification and for regression, +using both continuous and categorical features. + +This section gives examples of using GBTs with the Pipelines API. +For more information on the algorithm, please see the [main MLlib docs on GBTs](mllib-ensembles.html). + +#### Inputs and Outputs + +We list the input and output (prediction) column types here. +All output columns are optional; to exclude an output column, set its corresponding Param to an empty string. + +##### Input Columns + + + + + + + + + + + + + + + + + + + + + + + + +
+<table class="table">
+  <thead>
+    <tr>
+      <th>Param name</th>
+      <th>Type(s)</th>
+      <th>Default</th>
+      <th>Description</th>
+    </tr>
+  </thead>
+  <tbody>
+    <tr>
+      <td>labelCol</td>
+      <td>Double</td>
+      <td>"label"</td>
+      <td>Label to predict</td>
+    </tr>
+    <tr>
+      <td>featuresCol</td>
+      <td>Vector</td>
+      <td>"features"</td>
+      <td>Feature vector</td>
+    </tr>
+  </tbody>
+</table>
+
+Note that `GBTClassifier` currently only supports binary labels.
+
+##### Output Columns (Predictions)
+
+<table class="table">
+  <thead>
+    <tr>
+      <th>Param name</th>
+      <th>Type(s)</th>
+      <th>Default</th>
+      <th>Description</th>
+      <th>Notes</th>
+    </tr>
+  </thead>
+  <tbody>
+    <tr>
+      <td>predictionCol</td>
+      <td>Double</td>
+      <td>"prediction"</td>
+      <td>Predicted label</td>
+      <td></td>
+    </tr>
+  </tbody>
+</table>
+ +In the future, `GBTClassifier` will also output columns for `rawPrediction` and `probability`, just as `RandomForestClassifier` does. + +#### Example: Classification + +The following examples load a dataset in LibSVM format, split it into training and test sets, train on the first dataset, and then evaluate on the held-out test set. +We use two feature transformers to prepare the data; these help index categories for the label and categorical features, adding metadata to the `DataFrame` which the tree-based algorithms can recognize. + +
+
+ +Refer to the [Scala API docs](api/scala/index.html#org.apache.spark.ml.classification.GBTClassifier) for more details. + +{% highlight scala %} +import org.apache.spark.ml.Pipeline +import org.apache.spark.ml.classification.GBTClassifier +import org.apache.spark.ml.classification.GBTClassificationModel +import org.apache.spark.ml.feature.{StringIndexer, IndexToString, VectorIndexer} +import org.apache.spark.ml.evaluation.MulticlassClassificationEvaluator +import org.apache.spark.mllib.util.MLUtils + +// Load and parse the data file, converting it to a DataFrame. +val data = MLUtils.loadLibSVMFile(sc, "data/mllib/sample_libsvm_data.txt").toDF() + +// Index labels, adding metadata to the label column. +// Fit on whole dataset to include all labels in index. +val labelIndexer = new StringIndexer() + .setInputCol("label") + .setOutputCol("indexedLabel") + .fit(data) +// Automatically identify categorical features, and index them. +// Set maxCategories so features with > 4 distinct values are treated as continuous. +val featureIndexer = new VectorIndexer() + .setInputCol("features") + .setOutputCol("indexedFeatures") + .setMaxCategories(4) + .fit(data) + +// Split the data into training and test sets (30% held out for testing) +val Array(trainingData, testData) = data.randomSplit(Array(0.7, 0.3)) + +// Train a GBT model. +val gbt = new GBTClassifier() + .setLabelCol("indexedLabel") + .setFeaturesCol("indexedFeatures") + .setMaxIter(10) + +// Convert indexed labels back to original labels. +val labelConverter = new IndexToString() + .setInputCol("prediction") + .setOutputCol("predictedLabel") + .setLabels(labelIndexer.labels) + +// Chain indexers and GBT in a Pipeline +val pipeline = new Pipeline() + .setStages(Array(labelIndexer, featureIndexer, gbt, labelConverter)) + +// Train model. This also runs the indexers. +val model = pipeline.fit(trainingData) + +// Make predictions. +val predictions = model.transform(testData) + +// Select example rows to display. +predictions.select("predictedLabel", "label", "features").show(5) + +// Select (prediction, true label) and compute test error +val evaluator = new MulticlassClassificationEvaluator() + .setLabelCol("indexedLabel") + .setPredictionCol("prediction") + .setMetricName("precision") +val accuracy = evaluator.evaluate(predictions) +println("Test Error = " + (1.0 - accuracy)) + +val gbtModel = model.stages(2).asInstanceOf[GBTClassificationModel] +println("Learned classification GBT model:\n" + gbtModel.toDebugString) +{% endhighlight %} +
+ +
+ +Refer to the [Java API docs](api/java/org/apache/spark/ml/classification/GBTClassifier.html) for more details. + +{% highlight java %} +import org.apache.spark.ml.Pipeline; +import org.apache.spark.ml.PipelineModel; +import org.apache.spark.ml.PipelineStage; +import org.apache.spark.ml.classification.GBTClassifier; +import org.apache.spark.ml.classification.GBTClassificationModel; +import org.apache.spark.ml.evaluation.MulticlassClassificationEvaluator; +import org.apache.spark.ml.feature.*; +import org.apache.spark.mllib.regression.LabeledPoint; +import org.apache.spark.mllib.util.MLUtils; +import org.apache.spark.rdd.RDD; +import org.apache.spark.sql.DataFrame; + +// Load and parse the data file, converting it to a DataFrame. +RDD rdd = MLUtils.loadLibSVMFile(sc.sc(), "data/mllib/sample_libsvm_data.txt"); +DataFrame data = jsql.createDataFrame(rdd, LabeledPoint.class); + +// Index labels, adding metadata to the label column. +// Fit on whole dataset to include all labels in index. +StringIndexerModel labelIndexer = new StringIndexer() + .setInputCol("label") + .setOutputCol("indexedLabel") + .fit(data); +// Automatically identify categorical features, and index them. +// Set maxCategories so features with > 4 distinct values are treated as continuous. +VectorIndexerModel featureIndexer = new VectorIndexer() + .setInputCol("features") + .setOutputCol("indexedFeatures") + .setMaxCategories(4) + .fit(data); + +// Split the data into training and test sets (30% held out for testing) +DataFrame[] splits = data.randomSplit(new double[] {0.7, 0.3}); +DataFrame trainingData = splits[0]; +DataFrame testData = splits[1]; + +// Train a GBT model. +GBTClassifier gbt = new GBTClassifier() + .setLabelCol("indexedLabel") + .setFeaturesCol("indexedFeatures") + .setMaxIter(10); + +// Convert indexed labels back to original labels. +IndexToString labelConverter = new IndexToString() + .setInputCol("prediction") + .setOutputCol("predictedLabel") + .setLabels(labelIndexer.labels()); + +// Chain indexers and GBT in a Pipeline +Pipeline pipeline = new Pipeline() + .setStages(new PipelineStage[] {labelIndexer, featureIndexer, gbt, labelConverter}); + +// Train model. This also runs the indexers. +PipelineModel model = pipeline.fit(trainingData); + +// Make predictions. +DataFrame predictions = model.transform(testData); + +// Select example rows to display. +predictions.select("predictedLabel", "label", "features").show(5); + +// Select (prediction, true label) and compute test error +MulticlassClassificationEvaluator evaluator = new MulticlassClassificationEvaluator() + .setLabelCol("indexedLabel") + .setPredictionCol("prediction") + .setMetricName("precision"); +double accuracy = evaluator.evaluate(predictions); +System.out.println("Test Error = " + (1.0 - accuracy)); + +GBTClassificationModel gbtModel = + (GBTClassificationModel)(model.stages()[2]); +System.out.println("Learned classification GBT model:\n" + gbtModel.toDebugString()); +{% endhighlight %} +
+ +
+ +Refer to the [Python API docs](api/python/pyspark.ml.html#pyspark.ml.classification.GBTClassifier) for more details. + +{% highlight python %} +from pyspark.ml import Pipeline +from pyspark.ml.classification import GBTClassifier +from pyspark.ml.feature import StringIndexer, VectorIndexer +from pyspark.ml.evaluation import MulticlassClassificationEvaluator +from pyspark.mllib.util import MLUtils + +# Load and parse the data file, converting it to a DataFrame. +data = MLUtils.loadLibSVMFile(sc, "data/mllib/sample_libsvm_data.txt").toDF() + +# Index labels, adding metadata to the label column. +# Fit on whole dataset to include all labels in index. +labelIndexer = StringIndexer(inputCol="label", outputCol="indexedLabel").fit(data) +# Automatically identify categorical features, and index them. +# Set maxCategories so features with > 4 distinct values are treated as continuous. +featureIndexer =\ + VectorIndexer(inputCol="features", outputCol="indexedFeatures", maxCategories=4).fit(data) + +# Split the data into training and test sets (30% held out for testing) +(trainingData, testData) = data.randomSplit([0.7, 0.3]) + +# Train a GBT model. +gbt = GBTClassifier(labelCol="indexedLabel", featuresCol="indexedFeatures", maxIter=10) + +# Chain indexers and GBT in a Pipeline +pipeline = Pipeline(stages=[labelIndexer, featureIndexer, gbt]) + +# Train model. This also runs the indexers. +model = pipeline.fit(trainingData) + +# Make predictions. +predictions = model.transform(testData) + +# Select example rows to display. +predictions.select("prediction", "indexedLabel", "features").show(5) + +# Select (prediction, true label) and compute test error +evaluator = MulticlassClassificationEvaluator( + labelCol="indexedLabel", predictionCol="prediction", metricName="precision") +accuracy = evaluator.evaluate(predictions) +print "Test Error = %g" % (1.0 - accuracy) + +gbtModel = model.stages[2] +print gbtModel # summary only +{% endhighlight %} +
+
+ +#### Example: Regression + +Note: For this example dataset, `GBTRegressor` actually only needs 1 iteration, but that will not +be true in general. + +
+
+ +Refer to the [Scala API docs](api/scala/index.html#org.apache.spark.ml.regression.GBTRegressor) for more details. + +{% highlight scala %} +import org.apache.spark.ml.Pipeline +import org.apache.spark.ml.regression.GBTRegressor +import org.apache.spark.ml.regression.GBTRegressionModel +import org.apache.spark.ml.feature.VectorIndexer +import org.apache.spark.ml.evaluation.RegressionEvaluator +import org.apache.spark.mllib.util.MLUtils + +// Load and parse the data file, converting it to a DataFrame. +val data = MLUtils.loadLibSVMFile(sc, "data/mllib/sample_libsvm_data.txt").toDF() + +// Automatically identify categorical features, and index them. +// Set maxCategories so features with > 4 distinct values are treated as continuous. +val featureIndexer = new VectorIndexer() + .setInputCol("features") + .setOutputCol("indexedFeatures") + .setMaxCategories(4) + .fit(data) + +// Split the data into training and test sets (30% held out for testing) +val Array(trainingData, testData) = data.randomSplit(Array(0.7, 0.3)) + +// Train a GBT model. +val gbt = new GBTRegressor() + .setLabelCol("label") + .setFeaturesCol("indexedFeatures") + .setMaxIter(10) + +// Chain indexer and GBT in a Pipeline +val pipeline = new Pipeline() + .setStages(Array(featureIndexer, gbt)) + +// Train model. This also runs the indexer. +val model = pipeline.fit(trainingData) + +// Make predictions. +val predictions = model.transform(testData) + +// Select example rows to display. +predictions.select("prediction", "label", "features").show(5) + +// Select (prediction, true label) and compute test error +val evaluator = new RegressionEvaluator() + .setLabelCol("label") + .setPredictionCol("prediction") + .setMetricName("rmse") +val rmse = evaluator.evaluate(predictions) +println("Root Mean Squared Error (RMSE) on test data = " + rmse) + +val gbtModel = model.stages(1).asInstanceOf[GBTRegressionModel] +println("Learned regression GBT model:\n" + gbtModel.toDebugString) +{% endhighlight %} +
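+
+As the note above says, the best number of iterations is data dependent. A hedged sketch of one way to pick
+`maxIter`, reusing the `pipeline`, `gbt`, and `trainingData` defined above (the grid values are purely
+illustrative):
+
+{% highlight scala %}
+import org.apache.spark.ml.evaluation.RegressionEvaluator
+import org.apache.spark.ml.tuning.{CrossValidator, ParamGridBuilder}
+
+// Candidate iteration counts; good values depend on the dataset.
+val paramGrid = new ParamGridBuilder()
+  .addGrid(gbt.maxIter, Array(5, 10, 20))
+  .build()
+
+val cv = new CrossValidator()
+  .setEstimator(pipeline)
+  .setEvaluator(new RegressionEvaluator().setMetricName("rmse"))
+  .setEstimatorParamMaps(paramGrid)
+  .setNumFolds(3)
+
+// Fits one pipeline per grid entry and keeps the setting with the best cross-validated RMSE.
+val cvModel = cv.fit(trainingData)
+{% endhighlight %}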
+ +
+ +Refer to the [Java API docs](api/java/org/apache/spark/ml/regression/GBTRegressor.html) for more details. + +{% highlight java %} +import org.apache.spark.ml.Pipeline; +import org.apache.spark.ml.PipelineModel; +import org.apache.spark.ml.PipelineStage; +import org.apache.spark.ml.evaluation.RegressionEvaluator; +import org.apache.spark.ml.feature.VectorIndexer; +import org.apache.spark.ml.feature.VectorIndexerModel; +import org.apache.spark.ml.regression.GBTRegressionModel; +import org.apache.spark.ml.regression.GBTRegressor; +import org.apache.spark.mllib.regression.LabeledPoint; +import org.apache.spark.mllib.util.MLUtils; +import org.apache.spark.rdd.RDD; +import org.apache.spark.sql.DataFrame; + +// Load and parse the data file, converting it to a DataFrame. +RDD rdd = MLUtils.loadLibSVMFile(sc.sc(), "data/mllib/sample_libsvm_data.txt"); +DataFrame data = jsql.createDataFrame(rdd, LabeledPoint.class); + +// Automatically identify categorical features, and index them. +// Set maxCategories so features with > 4 distinct values are treated as continuous. +VectorIndexerModel featureIndexer = new VectorIndexer() + .setInputCol("features") + .setOutputCol("indexedFeatures") + .setMaxCategories(4) + .fit(data); + +// Split the data into training and test sets (30% held out for testing) +DataFrame[] splits = data.randomSplit(new double[] {0.7, 0.3}); +DataFrame trainingData = splits[0]; +DataFrame testData = splits[1]; + +// Train a GBT model. +GBTRegressor gbt = new GBTRegressor() + .setLabelCol("label") + .setFeaturesCol("indexedFeatures") + .setMaxIter(10); + +// Chain indexer and GBT in a Pipeline +Pipeline pipeline = new Pipeline() + .setStages(new PipelineStage[] {featureIndexer, gbt}); + +// Train model. This also runs the indexer. +PipelineModel model = pipeline.fit(trainingData); + +// Make predictions. +DataFrame predictions = model.transform(testData); + +// Select example rows to display. +predictions.select("prediction", "label", "features").show(5); + +// Select (prediction, true label) and compute test error +RegressionEvaluator evaluator = new RegressionEvaluator() + .setLabelCol("label") + .setPredictionCol("prediction") + .setMetricName("rmse"); +double rmse = evaluator.evaluate(predictions); +System.out.println("Root Mean Squared Error (RMSE) on test data = " + rmse); + +GBTRegressionModel gbtModel = + (GBTRegressionModel)(model.stages()[1]); +System.out.println("Learned regression GBT model:\n" + gbtModel.toDebugString()); +{% endhighlight %} +
+ +
+ +Refer to the [Python API docs](api/python/pyspark.ml.html#pyspark.ml.regression.GBTRegressor) for more details. + +{% highlight python %} +from pyspark.ml import Pipeline +from pyspark.ml.regression import GBTRegressor +from pyspark.ml.feature import VectorIndexer +from pyspark.ml.evaluation import RegressionEvaluator +from pyspark.mllib.util import MLUtils + +# Load and parse the data file, converting it to a DataFrame. +data = MLUtils.loadLibSVMFile(sc, "data/mllib/sample_libsvm_data.txt").toDF() + +# Automatically identify categorical features, and index them. +# Set maxCategories so features with > 4 distinct values are treated as continuous. +featureIndexer =\ + VectorIndexer(inputCol="features", outputCol="indexedFeatures", maxCategories=4).fit(data) + +# Split the data into training and test sets (30% held out for testing) +(trainingData, testData) = data.randomSplit([0.7, 0.3]) + +# Train a GBT model. +gbt = GBTRegressor(featuresCol="indexedFeatures", maxIter=10) + +# Chain indexer and GBT in a Pipeline +pipeline = Pipeline(stages=[featureIndexer, gbt]) + +# Train model. This also runs the indexer. +model = pipeline.fit(trainingData) + +# Make predictions. +predictions = model.transform(testData) + +# Select example rows to display. +predictions.select("prediction", "label", "features").show(5) + +# Select (prediction, true label) and compute test error +evaluator = RegressionEvaluator( + labelCol="label", predictionCol="prediction", metricName="rmse") +rmse = evaluator.evaluate(predictions) +print "Root Mean Squared Error (RMSE) on test data = %g" % rmse + +gbtModel = model.stages[1] +print gbtModel # summary only +{% endhighlight %} +
+
+ + +## One-vs-Rest (a.k.a. One-vs-All) + +[OneVsRest](http://en.wikipedia.org/wiki/Multiclass_classification#One-vs.-rest) is a machine learning reduction for performing multiclass classification with a base classifier that can perform binary classification efficiently; it is also known as "One-vs-All." `OneVsRest` is implemented as an `Estimator`. It takes a base classifier (an instance of `Classifier`) and creates one binary classification problem for each of the k classes: the classifier for class i is trained to predict whether the label is i or not, distinguishing class i from all other classes. @@ -28,6 +964,9 @@ The example below demonstrates how to load the
+ +Refer to the [Scala API docs](api/scala/index.html#org.apache.spark.ml.classification.OneVsRest) for more details. + {% highlight scala %} import org.apache.spark.ml.classification.{LogisticRegression, OneVsRest} import org.apache.spark.mllib.evaluation.MulticlassMetrics @@ -64,9 +1003,12 @@ println("label\tfpr\n") } {% endhighlight %}
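+
+Because `OneVsRest` reduces a k-class problem to k binary problems, the fitted `OneVsRestModel` keeps one
+binary classification model per class. A small sketch of inspecting them; the name `ovrModel` is assumed to
+refer to the model fitted in the example above:
+
+{% highlight scala %}
+// `ovrModel` is assumed to be the OneVsRestModel fitted above.
+ovrModel.models.zipWithIndex.foreach { case (binaryModel, classIndex) =>
+  println(s"class $classIndex: ${binaryModel.getClass.getSimpleName}")
+}
+{% endhighlight %}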
+
-{% highlight java %} +Refer to the [Java API docs](api/java/org/apache/spark/ml/classification/OneVsRest.html) for more details. + +{% highlight java %} import org.apache.spark.SparkConf; import org.apache.spark.api.java.JavaSparkContext; import org.apache.spark.ml.classification.LogisticRegression; @@ -88,7 +1030,7 @@ RDD data = MLUtils.loadLibSVMFile(jsc.sc(), "data/mllib/sample_multiclass_classification_data.txt"); DataFrame dataFrame = jsql.createDataFrame(data, LabeledPoint.class); -DataFrame[] splits = dataFrame.randomSplit(new double[]{0.7, 0.3}, 12345); +DataFrame[] splits = dataFrame.randomSplit(new double[] {0.7, 0.3}, 12345); DataFrame train = splits[0]; DataFrame test = splits[1]; From d7b4c095271c36fcc7f9ded267ecf5ec66fac803 Mon Sep 17 00:00:00 2001 From: Josh Rosen Date: Mon, 24 Aug 2015 16:17:45 -0700 Subject: [PATCH 052/802] [SPARK-10190] Fix NPE in CatalystTypeConverters Decimal toScala converter This adds a missing null check to the Decimal `toScala` converter in `CatalystTypeConverters`, fixing an NPE. Author: Josh Rosen Closes #8401 from JoshRosen/SPARK-10190. --- .../apache/spark/sql/catalyst/CatalystTypeConverters.scala | 5 ++++- .../spark/sql/catalyst/CatalystTypeConvertersSuite.scala | 4 +++- 2 files changed, 7 insertions(+), 2 deletions(-) diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/CatalystTypeConverters.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/CatalystTypeConverters.scala index 8d0c64eae4774..966623ed017ba 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/CatalystTypeConverters.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/CatalystTypeConverters.scala @@ -329,7 +329,10 @@ object CatalystTypeConverters { null } } - override def toScala(catalystValue: Decimal): JavaBigDecimal = catalystValue.toJavaBigDecimal + override def toScala(catalystValue: Decimal): JavaBigDecimal = { + if (catalystValue == null) null + else catalystValue.toJavaBigDecimal + } override def toScalaImpl(row: InternalRow, column: Int): JavaBigDecimal = row.getDecimal(column, dataType.precision, dataType.scale).toJavaBigDecimal } diff --git a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/CatalystTypeConvertersSuite.scala b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/CatalystTypeConvertersSuite.scala index df0f04563edcf..03bb102c67fe7 100644 --- a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/CatalystTypeConvertersSuite.scala +++ b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/CatalystTypeConvertersSuite.scala @@ -32,7 +32,9 @@ class CatalystTypeConvertersSuite extends SparkFunSuite { IntegerType, LongType, FloatType, - DoubleType) + DoubleType, + DecimalType.SYSTEM_DEFAULT, + DecimalType.USER_DEFAULT) test("null handling in rows") { val schema = StructType(simpleTypes.map(t => StructField(t.getClass.getName, t))) From 2bf338c626e9d97ccc033cfadae8b36a82c66fd1 Mon Sep 17 00:00:00 2001 From: Michael Armbrust Date: Mon, 24 Aug 2015 18:10:51 -0700 Subject: [PATCH 053/802] [SPARK-10165] [SQL] Await child resolution in ResolveFunctions Currently, we eagerly attempt to resolve functions, even before their children are resolved. However, this is not valid in cases where we need to know the types of the input arguments (i.e. when resolving Hive UDFs). As a fix, this PR delays function resolution until the functions children are resolved. 
This change also necessitates a change to the way we resolve aggregate expressions that are not in aggregate operators (e.g., in `HAVING` or `ORDER BY` clauses). Specifically, we can't assume that these misplaced functions will be resolved, allowing us to differentiate aggregate functions from normal functions. To compensate for this change we now attempt to resolve these unresolved expressions in the context of the aggregate operator, before checking to see if any aggregate expressions are present. Author: Michael Armbrust Closes #8371 from marmbrus/hiveUDFResolution. --- .../sql/catalyst/analysis/Analyzer.scala | 116 +++++++++++------- .../sql/hive/execution/HiveUDFSuite.scala | 5 + 2 files changed, 77 insertions(+), 44 deletions(-) diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/Analyzer.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/Analyzer.scala index d0eb9c2c90bdf..1a5de15c61f86 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/Analyzer.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/Analyzer.scala @@ -78,7 +78,7 @@ class Analyzer( ResolveAliases :: ExtractWindowExpressions :: GlobalAggregates :: - UnresolvedHavingClauseAttributes :: + ResolveAggregateFunctions :: HiveTypeCoercion.typeCoercionRules ++ extendedResolutionRules : _*), Batch("Nondeterministic", Once, @@ -452,37 +452,6 @@ class Analyzer( logDebug(s"Failed to find $missing in ${p.output.mkString(", ")}") s // Nothing we can do here. Return original plan. } - case s @ Sort(ordering, global, a @ Aggregate(grouping, aggs, child)) - if !s.resolved && a.resolved => - // A small hack to create an object that will allow us to resolve any references that - // refer to named expressions that are present in the grouping expressions. - val groupingRelation = LocalRelation( - grouping.collect { case ne: NamedExpression => ne.toAttribute } - ) - - // Find sort attributes that are projected away so we can temporarily add them back in. - val (newOrdering, missingAttr) = resolveAndFindMissing(ordering, a, groupingRelation) - - // Find aggregate expressions and evaluate them early, since they can't be evaluated in a - // Sort. - val (withAggsRemoved, aliasedAggregateList) = newOrdering.map { - case aggOrdering if aggOrdering.collect { case a: AggregateExpression => a }.nonEmpty => - val aliased = Alias(aggOrdering.child, "_aggOrdering")() - (aggOrdering.copy(child = aliased.toAttribute), Some(aliased)) - - case other => (other, None) - }.unzip - - val missing = missingAttr ++ aliasedAggregateList.flatten - - if (missing.nonEmpty) { - // Add missing grouping exprs and then project them away after the sort. - Project(a.output, - Sort(withAggsRemoved, global, - Aggregate(grouping, aggs ++ missing, child))) - } else { - s // Nothing we can do here. Return original plan. - } } /** @@ -515,6 +484,7 @@ class Analyzer( def apply(plan: LogicalPlan): LogicalPlan = plan resolveOperators { case q: LogicalPlan => q transformExpressions { + case u if !u.childrenResolved => u // Skip until children are resolved. case u @ UnresolvedFunction(name, children, isDistinct) => withPosition(u) { registry.lookupFunction(name, children) match { @@ -559,21 +529,79 @@ class Analyzer( } /** - * This rule finds expressions in HAVING clause filters that depend on - * unresolved attributes. It pushes these expressions down to the underlying - * aggregates and then projects them away above the filter. 
+ * This rule finds aggregate expressions that are not in an aggregate operator. For example, + * those in a HAVING clause or ORDER BY clause. These expressions are pushed down to the + * underlying aggregate operator and then projected away after the original operator. */ - object UnresolvedHavingClauseAttributes extends Rule[LogicalPlan] { + object ResolveAggregateFunctions extends Rule[LogicalPlan] { def apply(plan: LogicalPlan): LogicalPlan = plan resolveOperators { - case filter @ Filter(havingCondition, aggregate @ Aggregate(_, originalAggExprs, _)) - if aggregate.resolved && containsAggregate(havingCondition) => - - val evaluatedCondition = Alias(havingCondition, "havingCondition")() - val aggExprsWithHaving = evaluatedCondition +: originalAggExprs + case filter @ Filter(havingCondition, + aggregate @ Aggregate(grouping, originalAggExprs, child)) + if aggregate.resolved && !filter.resolved => + + // Try resolving the condition of the filter as though it is in the aggregate clause + val aggregatedCondition = + Aggregate(grouping, Alias(havingCondition, "havingCondition")() :: Nil, child) + val resolvedOperator = execute(aggregatedCondition) + def resolvedAggregateFilter = + resolvedOperator + .asInstanceOf[Aggregate] + .aggregateExpressions.head + + // If resolution was successful and we see the filter has an aggregate in it, add it to + // the original aggregate operator. + if (resolvedOperator.resolved && containsAggregate(resolvedAggregateFilter)) { + val aggExprsWithHaving = resolvedAggregateFilter +: originalAggExprs + + Project(aggregate.output, + Filter(resolvedAggregateFilter.toAttribute, + aggregate.copy(aggregateExpressions = aggExprsWithHaving))) + } else { + filter + } - Project(aggregate.output, - Filter(evaluatedCondition.toAttribute, - aggregate.copy(aggregateExpressions = aggExprsWithHaving))) + case sort @ Sort(sortOrder, global, + aggregate @ Aggregate(grouping, originalAggExprs, child)) + if aggregate.resolved && !sort.resolved => + + // Try resolving the ordering as though it is in the aggregate clause. + try { + val aliasedOrder = sortOrder.map(o => Alias(o.child, "aggOrder")()) + val aggregatedOrdering = Aggregate(grouping, aliasedOrder, child) + val resolvedOperator: Aggregate = execute(aggregatedOrdering).asInstanceOf[Aggregate] + def resolvedAggregateOrdering = resolvedOperator.aggregateExpressions + + // Expressions that have an aggregate can be pushed down. + val needsAggregate = resolvedAggregateOrdering.exists(containsAggregate) + + // Attribute references, that are missing from the order but are present in the grouping + // expressions can also be pushed down. + val requiredAttributes = resolvedAggregateOrdering.map(_.references).reduce(_ ++ _) + val missingAttributes = requiredAttributes -- aggregate.outputSet + val validPushdownAttributes = + missingAttributes.filter(a => grouping.exists(a.semanticEquals)) + + // If resolution was successful and we see the ordering either has an aggregate in it or + // it is missing something that is projected away by the aggregate, add the ordering + // the original aggregate operator. 
+ if (resolvedOperator.resolved && (needsAggregate || validPushdownAttributes.nonEmpty)) { + val evaluatedOrderings: Seq[SortOrder] = sortOrder.zip(resolvedAggregateOrdering).map { + case (order, evaluated) => order.copy(child = evaluated.toAttribute) + } + val aggExprsWithOrdering: Seq[NamedExpression] = + resolvedAggregateOrdering ++ originalAggExprs + + Project(aggregate.output, + Sort(evaluatedOrderings, global, + aggregate.copy(aggregateExpressions = aggExprsWithOrdering))) + } else { + sort + } + } catch { + // Attempting to resolve in the aggregate can result in ambiguity. When this happens, + // just return the original plan. + case ae: AnalysisException => sort + } } protected def containsAggregate(condition: Expression): Boolean = { diff --git a/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/HiveUDFSuite.scala b/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/HiveUDFSuite.scala index 10f2902e5eef0..b03a35132325d 100644 --- a/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/HiveUDFSuite.scala +++ b/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/HiveUDFSuite.scala @@ -276,6 +276,11 @@ class HiveUDFSuite extends QueryTest { checkAnswer( sql("SELECT testStringStringUDF(\"hello\", s) FROM stringTable"), Seq(Row("hello world"), Row("hello goodbye"))) + + checkAnswer( + sql("SELECT testStringStringUDF(\"\", testStringStringUDF(\"hello\", s)) FROM stringTable"), + Seq(Row(" hello world"), Row(" hello goodbye"))) + sql("DROP TEMPORARY FUNCTION IF EXISTS testStringStringUDF") TestHive.reset() From 6511bf559b736d8e23ae398901c8d78938e66869 Mon Sep 17 00:00:00 2001 From: Yu ISHIKAWA Date: Mon, 24 Aug 2015 18:17:51 -0700 Subject: [PATCH 054/802] [SPARK-10118] [SPARKR] [DOCS] Improve SparkR API docs for 1.5 release cc: shivaram ## Summary - Modify `tdname` of expression functions. i.e. `ascii`: `rdname functions` => `rdname ascii` - Replace the dynamical function definitions to the static ones because of thir documentations. ## Generated PDF File https://drive.google.com/file/d/0B9biIZIU47lLX2t6ZjRoRnBTSEU/view?usp=sharing ## JIRA [[SPARK-10118] Improve SparkR API docs for 1.5 release - ASF JIRA](https://issues.apache.org/jira/browse/SPARK-10118) Author: Yu ISHIKAWA Author: Yuu ISHIKAWA Closes #8386 from yu-iskw/SPARK-10118. --- R/create-docs.sh | 2 +- R/pkg/R/column.R | 5 +- R/pkg/R/functions.R | 1603 +++++++++++++++++++++++++++++++++++++++---- R/pkg/R/generics.R | 214 +++--- 4 files changed, 1596 insertions(+), 228 deletions(-) diff --git a/R/create-docs.sh b/R/create-docs.sh index 6a4687b06ecb9..d2ae160b50021 100755 --- a/R/create-docs.sh +++ b/R/create-docs.sh @@ -39,7 +39,7 @@ pushd $FWDIR mkdir -p pkg/html pushd pkg/html -Rscript -e 'library(SparkR, lib.loc="../../lib"); library(knitr); knit_rd("SparkR")' +Rscript -e 'libDir <- "../../lib"; library(SparkR, lib.loc=libDir); library(knitr); knit_rd("SparkR", links = tools::findHTMLlinks(paste(libDir, "SparkR", sep="/")))' popd diff --git a/R/pkg/R/column.R b/R/pkg/R/column.R index 5a07ebd308296..a1f50c383367c 100644 --- a/R/pkg/R/column.R +++ b/R/pkg/R/column.R @@ -169,8 +169,7 @@ setMethod("between", signature(x = "Column"), #' #' @rdname column #' -#' @examples -#' \dontrun{ +#' @examples \dontrun{ #' cast(df$age, "string") #' cast(df$name, list(type="array", elementType="byte", containsNull = TRUE)) #' } @@ -192,7 +191,7 @@ setMethod("cast", #' #' @rdname column #' @return a matched values as a result of comparing with given values. 
-#' \dontrun{ +#' @examples \dontrun{ #' filter(df, "age in (10, 30)") #' where(df, df$age %in% c(10, 30)) #' } diff --git a/R/pkg/R/functions.R b/R/pkg/R/functions.R index b5879bd9ad553..d848730e70433 100644 --- a/R/pkg/R/functions.R +++ b/R/pkg/R/functions.R @@ -18,69 +18,1298 @@ #' @include generics.R column.R NULL -#' @title S4 expression functions for DataFrame column(s) -#' @description These are expression functions on DataFrame columns - -functions1 <- c( - "abs", "acos", "approxCountDistinct", "ascii", "asin", "atan", - "avg", "base64", "bin", "bitwiseNOT", "cbrt", "ceil", "cos", "cosh", "count", - "crc32", "dayofmonth", "dayofyear", "exp", "explode", "expm1", "factorial", - "first", "floor", "hex", "hour", "initcap", "isNaN", "last", "last_day", - "length", "log", "log10", "log1p", "log2", "lower", "ltrim", "max", "md5", - "mean", "min", "minute", "month", "negate", "quarter", "reverse", - "rint", "round", "rtrim", "second", "sha1", "signum", "sin", "sinh", "size", - "soundex", "sqrt", "sum", "sumDistinct", "tan", "tanh", "toDegrees", - "toRadians", "to_date", "trim", "unbase64", "unhex", "upper", "weekofyear", - "year") -functions2 <- c( - "atan2", "datediff", "hypot", "levenshtein", "months_between", "nanvl", "pmod") - -createFunction1 <- function(name) { - setMethod(name, - signature(x = "Column"), - function(x) { - jc <- callJStatic("org.apache.spark.sql.functions", name, x@jc) - column(jc) - }) -} - -createFunction2 <- function(name) { - setMethod(name, - signature(y = "Column"), - function(y, x) { - if (class(x) == "Column") { - x <- x@jc - } - jc <- callJStatic("org.apache.spark.sql.functions", name, y@jc, x) - column(jc) - }) -} +#' Creates a \code{Column} of literal value. +#' +#' The passed in object is returned directly if it is already a \linkS4class{Column}. +#' If the object is a Scala Symbol, it is converted into a \linkS4class{Column} also. +#' Otherwise, a new \linkS4class{Column} is created to represent the literal value. +#' +#' @family normal_funcs +#' @rdname lit +#' @name lit +#' @export +setMethod("lit", signature("ANY"), + function(x) { + jc <- callJStatic("org.apache.spark.sql.functions", + "lit", + ifelse(class(x) == "Column", x@jc, x)) + column(jc) + }) + +#' abs +#' +#' Computes the absolute value. +#' +#' @rdname abs +#' @name abs +#' @family normal_funcs +#' @export +#' @examples \dontrun{abs(df$c)} +setMethod("abs", + signature(x = "Column"), + function(x) { + jc <- callJStatic("org.apache.spark.sql.functions", "abs", x@jc) + column(jc) + }) + +#' acos +#' +#' Computes the cosine inverse of the given value; the returned angle is in the range +#' 0.0 through pi. +#' +#' @rdname acos +#' @name acos +#' @family math_funcs +#' @export +#' @examples \dontrun{acos(df$c)} +setMethod("acos", + signature(x = "Column"), + function(x) { + jc <- callJStatic("org.apache.spark.sql.functions", "acos", x@jc) + column(jc) + }) + +#' approxCountDistinct +#' +#' Aggregate function: returns the approximate number of distinct items in a group. +#' +#' @rdname approxCountDistinct +#' @name approxCountDistinct +#' @family agg_funcs +#' @export +#' @examples \dontrun{approxCountDistinct(df$c)} +setMethod("approxCountDistinct", + signature(x = "Column"), + function(x) { + jc <- callJStatic("org.apache.spark.sql.functions", "approxCountDistinct", x@jc) + column(jc) + }) + +#' ascii +#' +#' Computes the numeric value of the first character of the string column, and returns the +#' result as a int column. 
+#' +#' @rdname ascii +#' @name ascii +#' @family string_funcs +#' @export +#' @examples \dontrun{\dontrun{ascii(df$c)}} +setMethod("ascii", + signature(x = "Column"), + function(x) { + jc <- callJStatic("org.apache.spark.sql.functions", "ascii", x@jc) + column(jc) + }) + +#' asin +#' +#' Computes the sine inverse of the given value; the returned angle is in the range +#' -pi/2 through pi/2. +#' +#' @rdname asin +#' @name asin +#' @family math_funcs +#' @export +#' @examples \dontrun{asin(df$c)} +setMethod("asin", + signature(x = "Column"), + function(x) { + jc <- callJStatic("org.apache.spark.sql.functions", "asin", x@jc) + column(jc) + }) + +#' atan +#' +#' Computes the tangent inverse of the given value. +#' +#' @rdname atan +#' @name atan +#' @family math_funcs +#' @export +#' @examples \dontrun{atan(df$c)} +setMethod("atan", + signature(x = "Column"), + function(x) { + jc <- callJStatic("org.apache.spark.sql.functions", "atan", x@jc) + column(jc) + }) + +#' avg +#' +#' Aggregate function: returns the average of the values in a group. +#' +#' @rdname avg +#' @name avg +#' @family agg_funcs +#' @export +#' @examples \dontrun{avg(df$c)} +setMethod("avg", + signature(x = "Column"), + function(x) { + jc <- callJStatic("org.apache.spark.sql.functions", "avg", x@jc) + column(jc) + }) + +#' base64 +#' +#' Computes the BASE64 encoding of a binary column and returns it as a string column. +#' This is the reverse of unbase64. +#' +#' @rdname base64 +#' @name base64 +#' @family string_funcs +#' @export +#' @examples \dontrun{base64(df$c)} +setMethod("base64", + signature(x = "Column"), + function(x) { + jc <- callJStatic("org.apache.spark.sql.functions", "base64", x@jc) + column(jc) + }) + +#' bin +#' +#' An expression that returns the string representation of the binary value of the given long +#' column. For example, bin("12") returns "1100". +#' +#' @rdname bin +#' @name bin +#' @family math_funcs +#' @export +#' @examples \dontrun{bin(df$c)} +setMethod("bin", + signature(x = "Column"), + function(x) { + jc <- callJStatic("org.apache.spark.sql.functions", "bin", x@jc) + column(jc) + }) + +#' bitwiseNOT +#' +#' Computes bitwise NOT. +#' +#' @rdname bitwiseNOT +#' @name bitwiseNOT +#' @family normal_funcs +#' @export +#' @examples \dontrun{bitwiseNOT(df$c)} +setMethod("bitwiseNOT", + signature(x = "Column"), + function(x) { + jc <- callJStatic("org.apache.spark.sql.functions", "bitwiseNOT", x@jc) + column(jc) + }) + +#' cbrt +#' +#' Computes the cube-root of the given value. +#' +#' @rdname cbrt +#' @name cbrt +#' @family math_funcs +#' @export +#' @examples \dontrun{cbrt(df$c)} +setMethod("cbrt", + signature(x = "Column"), + function(x) { + jc <- callJStatic("org.apache.spark.sql.functions", "cbrt", x@jc) + column(jc) + }) + +#' ceil +#' +#' Computes the ceiling of the given value. +#' +#' @rdname ceil +#' @name ceil +#' @family math_funcs +#' @export +#' @examples \dontrun{ceil(df$c)} +setMethod("ceil", + signature(x = "Column"), + function(x) { + jc <- callJStatic("org.apache.spark.sql.functions", "ceil", x@jc) + column(jc) + }) + +#' cos +#' +#' Computes the cosine of the given value. +#' +#' @rdname cos +#' @name cos +#' @family math_funcs +#' @export +#' @examples \dontrun{cos(df$c)} +setMethod("cos", + signature(x = "Column"), + function(x) { + jc <- callJStatic("org.apache.spark.sql.functions", "cos", x@jc) + column(jc) + }) + +#' cosh +#' +#' Computes the hyperbolic cosine of the given value. 
+#' +#' @rdname cosh +#' @name cosh +#' @family math_funcs +#' @export +#' @examples \dontrun{cosh(df$c)} +setMethod("cosh", + signature(x = "Column"), + function(x) { + jc <- callJStatic("org.apache.spark.sql.functions", "cosh", x@jc) + column(jc) + }) + +#' count +#' +#' Aggregate function: returns the number of items in a group. +#' +#' @rdname count +#' @name count +#' @family agg_funcs +#' @export +#' @examples \dontrun{count(df$c)} +setMethod("count", + signature(x = "Column"), + function(x) { + jc <- callJStatic("org.apache.spark.sql.functions", "count", x@jc) + column(jc) + }) + +#' crc32 +#' +#' Calculates the cyclic redundancy check value (CRC32) of a binary column and +#' returns the value as a bigint. +#' +#' @rdname crc32 +#' @name crc32 +#' @family misc_funcs +#' @export +#' @examples \dontrun{crc32(df$c)} +setMethod("crc32", + signature(x = "Column"), + function(x) { + jc <- callJStatic("org.apache.spark.sql.functions", "crc32", x@jc) + column(jc) + }) + +#' dayofmonth +#' +#' Extracts the day of the month as an integer from a given date/timestamp/string. +#' +#' @rdname dayofmonth +#' @name dayofmonth +#' @family datetime_funcs +#' @export +#' @examples \dontrun{dayofmonth(df$c)} +setMethod("dayofmonth", + signature(x = "Column"), + function(x) { + jc <- callJStatic("org.apache.spark.sql.functions", "dayofmonth", x@jc) + column(jc) + }) + +#' dayofyear +#' +#' Extracts the day of the year as an integer from a given date/timestamp/string. +#' +#' @rdname dayofyear +#' @name dayofyear +#' @family datetime_funcs +#' @export +#' @examples \dontrun{dayofyear(df$c)} +setMethod("dayofyear", + signature(x = "Column"), + function(x) { + jc <- callJStatic("org.apache.spark.sql.functions", "dayofyear", x@jc) + column(jc) + }) + +#' exp +#' +#' Computes the exponential of the given value. +#' +#' @rdname exp +#' @name exp +#' @family math_funcs +#' @export +#' @examples \dontrun{exp(df$c)} +setMethod("exp", + signature(x = "Column"), + function(x) { + jc <- callJStatic("org.apache.spark.sql.functions", "exp", x@jc) + column(jc) + }) + +#' explode +#' +#' Creates a new row for each element in the given array or map column. +#' +#' @rdname explode +#' @name explode +#' @family collection_funcs +#' @export +#' @examples \dontrun{explode(df$c)} +setMethod("explode", + signature(x = "Column"), + function(x) { + jc <- callJStatic("org.apache.spark.sql.functions", "explode", x@jc) + column(jc) + }) + +#' expm1 +#' +#' Computes the exponential of the given value minus one. +#' +#' @rdname expm1 +#' @name expm1 +#' @family math_funcs +#' @export +#' @examples \dontrun{expm1(df$c)} +setMethod("expm1", + signature(x = "Column"), + function(x) { + jc <- callJStatic("org.apache.spark.sql.functions", "expm1", x@jc) + column(jc) + }) + +#' factorial +#' +#' Computes the factorial of the given value. +#' +#' @rdname factorial +#' @name factorial +#' @family math_funcs +#' @export +#' @examples \dontrun{factorial(df$c)} +setMethod("factorial", + signature(x = "Column"), + function(x) { + jc <- callJStatic("org.apache.spark.sql.functions", "factorial", x@jc) + column(jc) + }) + +#' first +#' +#' Aggregate function: returns the first value in a group. +#' +#' @rdname first +#' @name first +#' @family agg_funcs +#' @export +#' @examples \dontrun{first(df$c)} +setMethod("first", + signature(x = "Column"), + function(x) { + jc <- callJStatic("org.apache.spark.sql.functions", "first", x@jc) + column(jc) + }) + +#' floor +#' +#' Computes the floor of the given value. 
+#' +#' @rdname floor +#' @name floor +#' @family math_funcs +#' @export +#' @examples \dontrun{floor(df$c)} +setMethod("floor", + signature(x = "Column"), + function(x) { + jc <- callJStatic("org.apache.spark.sql.functions", "floor", x@jc) + column(jc) + }) + +#' hex +#' +#' Computes hex value of the given column. +#' +#' @rdname hex +#' @name hex +#' @family math_funcs +#' @export +#' @examples \dontrun{hex(df$c)} +setMethod("hex", + signature(x = "Column"), + function(x) { + jc <- callJStatic("org.apache.spark.sql.functions", "hex", x@jc) + column(jc) + }) + +#' hour +#' +#' Extracts the hours as an integer from a given date/timestamp/string. +#' +#' @rdname hour +#' @name hour +#' @family datetime_funcs +#' @export +#' @examples \dontrun{hour(df$c)} +setMethod("hour", + signature(x = "Column"), + function(x) { + jc <- callJStatic("org.apache.spark.sql.functions", "hour", x@jc) + column(jc) + }) + +#' initcap +#' +#' Returns a new string column by converting the first letter of each word to uppercase. +#' Words are delimited by whitespace. +#' +#' For example, "hello world" will become "Hello World". +#' +#' @rdname initcap +#' @name initcap +#' @family string_funcs +#' @export +#' @examples \dontrun{initcap(df$c)} +setMethod("initcap", + signature(x = "Column"), + function(x) { + jc <- callJStatic("org.apache.spark.sql.functions", "initcap", x@jc) + column(jc) + }) + +#' isNaN +#' +#' Return true iff the column is NaN. +#' +#' @rdname isNaN +#' @name isNaN +#' @family normal_funcs +#' @export +#' @examples \dontrun{isNaN(df$c)} +setMethod("isNaN", + signature(x = "Column"), + function(x) { + jc <- callJStatic("org.apache.spark.sql.functions", "isNaN", x@jc) + column(jc) + }) + +#' last +#' +#' Aggregate function: returns the last value in a group. +#' +#' @rdname last +#' @name last +#' @family agg_funcs +#' @export +#' @examples \dontrun{last(df$c)} +setMethod("last", + signature(x = "Column"), + function(x) { + jc <- callJStatic("org.apache.spark.sql.functions", "last", x@jc) + column(jc) + }) + +#' last_day +#' +#' Given a date column, returns the last day of the month which the given date belongs to. +#' For example, input "2015-07-27" returns "2015-07-31" since July 31 is the last day of the +#' month in July 2015. +#' +#' @rdname last_day +#' @name last_day +#' @family datetime_funcs +#' @export +#' @examples \dontrun{last_day(df$c)} +setMethod("last_day", + signature(x = "Column"), + function(x) { + jc <- callJStatic("org.apache.spark.sql.functions", "last_day", x@jc) + column(jc) + }) + +#' length +#' +#' Computes the length of a given string or binary column. +#' +#' @rdname length +#' @name length +#' @family string_funcs +#' @export +#' @examples \dontrun{length(df$c)} +setMethod("length", + signature(x = "Column"), + function(x) { + jc <- callJStatic("org.apache.spark.sql.functions", "length", x@jc) + column(jc) + }) + +#' log +#' +#' Computes the natural logarithm of the given value. +#' +#' @rdname log +#' @name log +#' @family math_funcs +#' @export +#' @examples \dontrun{log(df$c)} +setMethod("log", + signature(x = "Column"), + function(x) { + jc <- callJStatic("org.apache.spark.sql.functions", "log", x@jc) + column(jc) + }) + +#' log10 +#' +#' Computes the logarithm of the given value in base 10. 
+#' +#' @rdname log10 +#' @name log10 +#' @family math_funcs +#' @export +#' @examples \dontrun{log10(df$c)} +setMethod("log10", + signature(x = "Column"), + function(x) { + jc <- callJStatic("org.apache.spark.sql.functions", "log10", x@jc) + column(jc) + }) + +#' log1p +#' +#' Computes the natural logarithm of the given value plus one. +#' +#' @rdname log1p +#' @name log1p +#' @family math_funcs +#' @export +#' @examples \dontrun{log1p(df$c)} +setMethod("log1p", + signature(x = "Column"), + function(x) { + jc <- callJStatic("org.apache.spark.sql.functions", "log1p", x@jc) + column(jc) + }) + +#' log2 +#' +#' Computes the logarithm of the given column in base 2. +#' +#' @rdname log2 +#' @name log2 +#' @family math_funcs +#' @export +#' @examples \dontrun{log2(df$c)} +setMethod("log2", + signature(x = "Column"), + function(x) { + jc <- callJStatic("org.apache.spark.sql.functions", "log2", x@jc) + column(jc) + }) + +#' lower +#' +#' Converts a string column to lower case. +#' +#' @rdname lower +#' @name lower +#' @family string_funcs +#' @export +#' @examples \dontrun{lower(df$c)} +setMethod("lower", + signature(x = "Column"), + function(x) { + jc <- callJStatic("org.apache.spark.sql.functions", "lower", x@jc) + column(jc) + }) + +#' ltrim +#' +#' Trim the spaces from left end for the specified string value. +#' +#' @rdname ltrim +#' @name ltrim +#' @family string_funcs +#' @export +#' @examples \dontrun{ltrim(df$c)} +setMethod("ltrim", + signature(x = "Column"), + function(x) { + jc <- callJStatic("org.apache.spark.sql.functions", "ltrim", x@jc) + column(jc) + }) + +#' max +#' +#' Aggregate function: returns the maximum value of the expression in a group. +#' +#' @rdname max +#' @name max +#' @family agg_funcs +#' @export +#' @examples \dontrun{max(df$c)} +setMethod("max", + signature(x = "Column"), + function(x) { + jc <- callJStatic("org.apache.spark.sql.functions", "max", x@jc) + column(jc) + }) + +#' md5 +#' +#' Calculates the MD5 digest of a binary column and returns the value +#' as a 32 character hex string. +#' +#' @rdname md5 +#' @name md5 +#' @family misc_funcs +#' @export +#' @examples \dontrun{md5(df$c)} +setMethod("md5", + signature(x = "Column"), + function(x) { + jc <- callJStatic("org.apache.spark.sql.functions", "md5", x@jc) + column(jc) + }) + +#' mean +#' +#' Aggregate function: returns the average of the values in a group. +#' Alias for avg. +#' +#' @rdname mean +#' @name mean +#' @family agg_funcs +#' @export +#' @examples \dontrun{mean(df$c)} +setMethod("mean", + signature(x = "Column"), + function(x) { + jc <- callJStatic("org.apache.spark.sql.functions", "mean", x@jc) + column(jc) + }) + +#' min +#' +#' Aggregate function: returns the minimum value of the expression in a group. +#' +#' @rdname min +#' @name min +#' @family agg_funcs +#' @export +#' @examples \dontrun{min(df$c)} +setMethod("min", + signature(x = "Column"), + function(x) { + jc <- callJStatic("org.apache.spark.sql.functions", "min", x@jc) + column(jc) + }) + +#' minute +#' +#' Extracts the minutes as an integer from a given date/timestamp/string. +#' +#' @rdname minute +#' @name minute +#' @family datetime_funcs +#' @export +#' @examples \dontrun{minute(df$c)} +setMethod("minute", + signature(x = "Column"), + function(x) { + jc <- callJStatic("org.apache.spark.sql.functions", "minute", x@jc) + column(jc) + }) + +#' month +#' +#' Extracts the month as an integer from a given date/timestamp/string. 
+#' +#' @rdname month +#' @name month +#' @family datetime_funcs +#' @export +#' @examples \dontrun{month(df$c)} +setMethod("month", + signature(x = "Column"), + function(x) { + jc <- callJStatic("org.apache.spark.sql.functions", "month", x@jc) + column(jc) + }) + +#' negate +#' +#' Unary minus, i.e. negate the expression. +#' +#' @rdname negate +#' @name negate +#' @family normal_funcs +#' @export +#' @examples \dontrun{negate(df$c)} +setMethod("negate", + signature(x = "Column"), + function(x) { + jc <- callJStatic("org.apache.spark.sql.functions", "negate", x@jc) + column(jc) + }) -createFunctions <- function() { - for (name in functions1) { - createFunction1(name) - } - for (name in functions2) { - createFunction2(name) - } -} +#' quarter +#' +#' Extracts the quarter as an integer from a given date/timestamp/string. +#' +#' @rdname quarter +#' @name quarter +#' @family datetime_funcs +#' @export +#' @examples \dontrun{quarter(df$c)} +setMethod("quarter", + signature(x = "Column"), + function(x) { + jc <- callJStatic("org.apache.spark.sql.functions", "quarter", x@jc) + column(jc) + }) -createFunctions() +#' reverse +#' +#' Reverses the string column and returns it as a new string column. +#' +#' @rdname reverse +#' @name reverse +#' @family string_funcs +#' @export +#' @examples \dontrun{reverse(df$c)} +setMethod("reverse", + signature(x = "Column"), + function(x) { + jc <- callJStatic("org.apache.spark.sql.functions", "reverse", x@jc) + column(jc) + }) -#' @rdname functions -#' @return Creates a Column class of literal value. -setMethod("lit", signature("ANY"), +#' rint +#' +#' Returns the double value that is closest in value to the argument and +#' is equal to a mathematical integer. +#' +#' @rdname rint +#' @name rint +#' @family math_funcs +#' @export +#' @examples \dontrun{rint(df$c)} +setMethod("rint", + signature(x = "Column"), function(x) { - jc <- callJStatic("org.apache.spark.sql.functions", - "lit", - ifelse(class(x) == "Column", x@jc, x)) + jc <- callJStatic("org.apache.spark.sql.functions", "rint", x@jc) + column(jc) + }) + +#' round +#' +#' Returns the value of the column `e` rounded to 0 decimal places. +#' +#' @rdname round +#' @name round +#' @family math_funcs +#' @export +#' @examples \dontrun{round(df$c)} +setMethod("round", + signature(x = "Column"), + function(x) { + jc <- callJStatic("org.apache.spark.sql.functions", "round", x@jc) + column(jc) + }) + +#' rtrim +#' +#' Trim the spaces from right end for the specified string value. +#' +#' @rdname rtrim +#' @name rtrim +#' @family string_funcs +#' @export +#' @examples \dontrun{rtrim(df$c)} +setMethod("rtrim", + signature(x = "Column"), + function(x) { + jc <- callJStatic("org.apache.spark.sql.functions", "rtrim", x@jc) + column(jc) + }) + +#' second +#' +#' Extracts the seconds as an integer from a given date/timestamp/string. +#' +#' @rdname second +#' @name second +#' @family datetime_funcs +#' @export +#' @examples \dontrun{second(df$c)} +setMethod("second", + signature(x = "Column"), + function(x) { + jc <- callJStatic("org.apache.spark.sql.functions", "second", x@jc) + column(jc) + }) + +#' sha1 +#' +#' Calculates the SHA-1 digest of a binary column and returns the value +#' as a 40 character hex string. 
+#' +#' @rdname sha1 +#' @name sha1 +#' @family misc_funcs +#' @export +#' @examples \dontrun{sha1(df$c)} +setMethod("sha1", + signature(x = "Column"), + function(x) { + jc <- callJStatic("org.apache.spark.sql.functions", "sha1", x@jc) + column(jc) + }) + +#' signum +#' +#' Computes the signum of the given value. +#' +#' @rdname signum +#' @name signum +#' @family math_funcs +#' @export +#' @examples \dontrun{signum(df$c)} +setMethod("signum", + signature(x = "Column"), + function(x) { + jc <- callJStatic("org.apache.spark.sql.functions", "signum", x@jc) + column(jc) + }) + +#' sin +#' +#' Computes the sine of the given value. +#' +#' @rdname sin +#' @name sin +#' @family math_funcs +#' @export +#' @examples \dontrun{sin(df$c)} +setMethod("sin", + signature(x = "Column"), + function(x) { + jc <- callJStatic("org.apache.spark.sql.functions", "sin", x@jc) + column(jc) + }) + +#' sinh +#' +#' Computes the hyperbolic sine of the given value. +#' +#' @rdname sinh +#' @name sinh +#' @family math_funcs +#' @export +#' @examples \dontrun{sinh(df$c)} +setMethod("sinh", + signature(x = "Column"), + function(x) { + jc <- callJStatic("org.apache.spark.sql.functions", "sinh", x@jc) + column(jc) + }) + +#' size +#' +#' Returns length of array or map. +#' +#' @rdname size +#' @name size +#' @family collection_funcs +#' @export +#' @examples \dontrun{size(df$c)} +setMethod("size", + signature(x = "Column"), + function(x) { + jc <- callJStatic("org.apache.spark.sql.functions", "size", x@jc) + column(jc) + }) + +#' soundex +#' +#' Return the soundex code for the specified expression. +#' +#' @rdname soundex +#' @name soundex +#' @family string_funcs +#' @export +#' @examples \dontrun{soundex(df$c)} +setMethod("soundex", + signature(x = "Column"), + function(x) { + jc <- callJStatic("org.apache.spark.sql.functions", "soundex", x@jc) + column(jc) + }) + +#' sqrt +#' +#' Computes the square root of the specified float value. +#' +#' @rdname sqrt +#' @name sqrt +#' @family math_funcs +#' @export +#' @examples \dontrun{sqrt(df$c)} +setMethod("sqrt", + signature(x = "Column"), + function(x) { + jc <- callJStatic("org.apache.spark.sql.functions", "sqrt", x@jc) + column(jc) + }) + +#' sum +#' +#' Aggregate function: returns the sum of all values in the expression. +#' +#' @rdname sum +#' @name sum +#' @family agg_funcs +#' @export +#' @examples \dontrun{sum(df$c)} +setMethod("sum", + signature(x = "Column"), + function(x) { + jc <- callJStatic("org.apache.spark.sql.functions", "sum", x@jc) + column(jc) + }) + +#' sumDistinct +#' +#' Aggregate function: returns the sum of distinct values in the expression. +#' +#' @rdname sumDistinct +#' @name sumDistinct +#' @family agg_funcs +#' @export +#' @examples \dontrun{sumDistinct(df$c)} +setMethod("sumDistinct", + signature(x = "Column"), + function(x) { + jc <- callJStatic("org.apache.spark.sql.functions", "sumDistinct", x@jc) + column(jc) + }) + +#' tan +#' +#' Computes the tangent of the given value. +#' +#' @rdname tan +#' @name tan +#' @family math_funcs +#' @export +#' @examples \dontrun{tan(df$c)} +setMethod("tan", + signature(x = "Column"), + function(x) { + jc <- callJStatic("org.apache.spark.sql.functions", "tan", x@jc) + column(jc) + }) + +#' tanh +#' +#' Computes the hyperbolic tangent of the given value. 
+#' +#' @rdname tanh +#' @name tanh +#' @family math_funcs +#' @export +#' @examples \dontrun{tanh(df$c)} +setMethod("tanh", + signature(x = "Column"), + function(x) { + jc <- callJStatic("org.apache.spark.sql.functions", "tanh", x@jc) + column(jc) + }) + +#' toDegrees +#' +#' Converts an angle measured in radians to an approximately equivalent angle measured in degrees. +#' +#' @rdname toDegrees +#' @name toDegrees +#' @family math_funcs +#' @export +#' @examples \dontrun{toDegrees(df$c)} +setMethod("toDegrees", + signature(x = "Column"), + function(x) { + jc <- callJStatic("org.apache.spark.sql.functions", "toDegrees", x@jc) + column(jc) + }) + +#' toRadians +#' +#' Converts an angle measured in degrees to an approximately equivalent angle measured in radians. +#' +#' @rdname toRadians +#' @name toRadians +#' @family math_funcs +#' @export +#' @examples \dontrun{toRadians(df$c)} +setMethod("toRadians", + signature(x = "Column"), + function(x) { + jc <- callJStatic("org.apache.spark.sql.functions", "toRadians", x@jc) + column(jc) + }) + +#' to_date +#' +#' Converts the column into DateType. +#' +#' @rdname to_date +#' @name to_date +#' @family datetime_funcs +#' @export +#' @examples \dontrun{to_date(df$c)} +setMethod("to_date", + signature(x = "Column"), + function(x) { + jc <- callJStatic("org.apache.spark.sql.functions", "to_date", x@jc) + column(jc) + }) + +#' trim +#' +#' Trim the spaces from both ends for the specified string column. +#' +#' @rdname trim +#' @name trim +#' @family string_funcs +#' @export +#' @examples \dontrun{trim(df$c)} +setMethod("trim", + signature(x = "Column"), + function(x) { + jc <- callJStatic("org.apache.spark.sql.functions", "trim", x@jc) + column(jc) + }) + +#' unbase64 +#' +#' Decodes a BASE64 encoded string column and returns it as a binary column. +#' This is the reverse of base64. +#' +#' @rdname unbase64 +#' @name unbase64 +#' @family string_funcs +#' @export +#' @examples \dontrun{unbase64(df$c)} +setMethod("unbase64", + signature(x = "Column"), + function(x) { + jc <- callJStatic("org.apache.spark.sql.functions", "unbase64", x@jc) + column(jc) + }) + +#' unhex +#' +#' Inverse of hex. Interprets each pair of characters as a hexadecimal number +#' and converts to the byte representation of number. +#' +#' @rdname unhex +#' @name unhex +#' @family math_funcs +#' @export +#' @examples \dontrun{unhex(df$c)} +setMethod("unhex", + signature(x = "Column"), + function(x) { + jc <- callJStatic("org.apache.spark.sql.functions", "unhex", x@jc) + column(jc) + }) + +#' upper +#' +#' Converts a string column to upper case. +#' +#' @rdname upper +#' @name upper +#' @family string_funcs +#' @export +#' @examples \dontrun{upper(df$c)} +setMethod("upper", + signature(x = "Column"), + function(x) { + jc <- callJStatic("org.apache.spark.sql.functions", "upper", x@jc) + column(jc) + }) + +#' weekofyear +#' +#' Extracts the week number as an integer from a given date/timestamp/string. +#' +#' @rdname weekofyear +#' @name weekofyear +#' @family datetime_funcs +#' @export +#' @examples \dontrun{weekofyear(df$c)} +setMethod("weekofyear", + signature(x = "Column"), + function(x) { + jc <- callJStatic("org.apache.spark.sql.functions", "weekofyear", x@jc) + column(jc) + }) + +#' year +#' +#' Extracts the year as an integer from a given date/timestamp/string. 
+#' +#' @rdname year +#' @name year +#' @family datetime_funcs +#' @export +#' @examples \dontrun{year(df$c)} +setMethod("year", + signature(x = "Column"), + function(x) { + jc <- callJStatic("org.apache.spark.sql.functions", "year", x@jc) column(jc) }) +#' atan2 +#' +#' Returns the angle theta from the conversion of rectangular coordinates (x, y) to +#' polar coordinates (r, theta). +#' +#' @rdname atan2 +#' @name atan2 +#' @family math_funcs +#' @export +#' @examples \dontrun{atan2(df$c, x)} +setMethod("atan2", signature(y = "Column"), + function(y, x) { + if (class(x) == "Column") { + x <- x@jc + } + jc <- callJStatic("org.apache.spark.sql.functions", "atan2", y@jc, x) + column(jc) + }) + +#' datediff +#' +#' Returns the number of days from `start` to `end`. +#' +#' @rdname datediff +#' @name datediff +#' @family datetime_funcs +#' @export +#' @examples \dontrun{datediff(df$c, x)} +setMethod("datediff", signature(y = "Column"), + function(y, x) { + if (class(x) == "Column") { + x <- x@jc + } + jc <- callJStatic("org.apache.spark.sql.functions", "datediff", y@jc, x) + column(jc) + }) + +#' hypot +#' +#' Computes `sqrt(a^2^ + b^2^)` without intermediate overflow or underflow. +#' +#' @rdname hypot +#' @name hypot +#' @family math_funcs +#' @export +#' @examples \dontrun{hypot(df$c, x)} +setMethod("hypot", signature(y = "Column"), + function(y, x) { + if (class(x) == "Column") { + x <- x@jc + } + jc <- callJStatic("org.apache.spark.sql.functions", "hypot", y@jc, x) + column(jc) + }) + +#' levenshtein +#' +#' Computes the Levenshtein distance of the two given string columns. +#' +#' @rdname levenshtein +#' @name levenshtein +#' @family string_funcs +#' @export +#' @examples \dontrun{levenshtein(df$c, x)} +setMethod("levenshtein", signature(y = "Column"), + function(y, x) { + if (class(x) == "Column") { + x <- x@jc + } + jc <- callJStatic("org.apache.spark.sql.functions", "levenshtein", y@jc, x) + column(jc) + }) + +#' months_between +#' +#' Returns number of months between dates `date1` and `date2`. +#' +#' @rdname months_between +#' @name months_between +#' @family datetime_funcs +#' @export +#' @examples \dontrun{months_between(df$c, x)} +setMethod("months_between", signature(y = "Column"), + function(y, x) { + if (class(x) == "Column") { + x <- x@jc + } + jc <- callJStatic("org.apache.spark.sql.functions", "months_between", y@jc, x) + column(jc) + }) + +#' nanvl +#' +#' Returns col1 if it is not NaN, or col2 if col1 is NaN. +#' hhBoth inputs should be floating point columns (DoubleType or FloatType). +#' +#' @rdname nanvl +#' @name nanvl +#' @family normal_funcs +#' @export +#' @examples \dontrun{nanvl(df$c, x)} +setMethod("nanvl", signature(y = "Column"), + function(y, x) { + if (class(x) == "Column") { + x <- x@jc + } + jc <- callJStatic("org.apache.spark.sql.functions", "nanvl", y@jc, x) + column(jc) + }) + +#' pmod +#' +#' Returns the positive value of dividend mod divisor. +#' +#' @rdname pmod +#' @name pmod +#' @docType methods +#' @family math_funcs +#' @export +#' @examples \dontrun{pmod(df$c, x)} +setMethod("pmod", signature(y = "Column"), + function(y, x) { + if (class(x) == "Column") { + x <- x@jc + } + jc <- callJStatic("org.apache.spark.sql.functions", "pmod", y@jc, x) + column(jc) + }) + + #' Approx Count Distinct #' -#' @rdname functions +#' @family agg_funcs +#' @rdname approxCountDistinct +#' @name approxCountDistinct #' @return the approximate number of distinct items in a group. 
+#' @export setMethod("approxCountDistinct", signature(x = "Column"), function(x, rsd = 0.95) { @@ -90,8 +1319,11 @@ setMethod("approxCountDistinct", #' Count Distinct #' -#' @rdname functions +#' @family agg_funcs +#' @rdname countDistinct +#' @name countDistinct #' @return the number of distinct items in a group. +#' @export setMethod("countDistinct", signature(x = "Column"), function(x, ...) { @@ -103,8 +1335,15 @@ setMethod("countDistinct", column(jc) }) -#' @rdname functions -#' @return Concatenates multiple input string columns together into a single string column. + +#' concat +#' +#' Concatenates multiple input string columns together into a single string column. +#' +#' @family string_funcs +#' @rdname concat +#' @name concat +#' @export setMethod("concat", signature(x = "Column"), function(x, ...) { @@ -113,9 +1352,15 @@ setMethod("concat", column(jc) }) -#' @rdname functions -#' @return Returns the greatest value of the list of column names, skipping null values. -#' This function takes at least 2 parameters. It will return null if all parameters are null. +#' greatest +#' +#' Returns the greatest value of the list of column names, skipping null values. +#' This function takes at least 2 parameters. It will return null if all parameters are null. +#' +#' @family normal_funcs +#' @rdname greatest +#' @name greatest +#' @export setMethod("greatest", signature(x = "Column"), function(x, ...) { @@ -125,9 +1370,15 @@ setMethod("greatest", column(jc) }) -#' @rdname functions -#' @return Returns the least value of the list of column names, skipping null values. -#' This function takes at least 2 parameters. It will return null iff all parameters are null. +#' least +#' +#' Returns the least value of the list of column names, skipping null values. +#' This function takes at least 2 parameters. It will return null iff all parameters are null. +#' +#' @family normal_funcs +#' @rdname least +#' @name least +#' @export setMethod("least", signature(x = "Column"), function(x, ...) { @@ -137,30 +1388,58 @@ setMethod("least", column(jc) }) -#' @rdname functions +#' ceiling +#' +#' Computes the ceiling of the given value. +#' +#' @family math_funcs +#' @rdname ceil +#' @name ceil #' @aliases ceil +#' @export setMethod("ceiling", signature(x = "Column"), function(x) { ceil(x) }) -#' @rdname functions +#' sign +#' +#' Computes the signum of the given value. +#' +#' @family math_funcs +#' @rdname signum +#' @name signum #' @aliases signum +#' @export setMethod("sign", signature(x = "Column"), function(x) { signum(x) }) -#' @rdname functions +#' n_distinct +#' +#' Aggregate function: returns the number of distinct items in a group. +#' +#' @family agg_funcs +#' @rdname countDistinct +#' @name countDistinct #' @aliases countDistinct +#' @export setMethod("n_distinct", signature(x = "Column"), function(x, ...) { countDistinct(x, ...) }) -#' @rdname functions +#' n +#' +#' Aggregate function: returns the number of items in a group. +#' +#' @family agg_funcs +#' @rdname count +#' @name count #' @aliases count +#' @export setMethod("n", signature(x = "Column"), function(x) { count(x) @@ -171,13 +1450,16 @@ setMethod("n", signature(x = "Column"), #' Converts a date/timestamp/string to a value of string in the format specified by the date #' format given by the second argument. #' -#' A pattern could be for instance `dd.MM.yyyy` and could return a string like '18.03.1993'. All -#' pattern letters of `java.text.SimpleDateFormat` can be used. 
+#' A pattern could be for instance \preformatted{dd.MM.yyyy} and could return a string like '18.03.1993'. All +#' pattern letters of \code{java.text.SimpleDateFormat} can be used. #' -#' NOTE: Use when ever possible specialized functions like `year`. These benefit from a +#' NOTE: Use when ever possible specialized functions like \code{year}. These benefit from a #' specialized implementation. #' -#' @rdname functions +#' @family datetime_funcs +#' @rdname date_format +#' @name date_format +#' @export setMethod("date_format", signature(y = "Column", x = "character"), function(y, x) { jc <- callJStatic("org.apache.spark.sql.functions", "date_format", y@jc, x) @@ -188,7 +1470,10 @@ setMethod("date_format", signature(y = "Column", x = "character"), #' #' Assumes given timestamp is UTC and converts to given timezone. #' -#' @rdname functions +#' @family datetime_funcs +#' @rdname from_utc_timestamp +#' @name from_utc_timestamp +#' @export setMethod("from_utc_timestamp", signature(y = "Column", x = "character"), function(y, x) { jc <- callJStatic("org.apache.spark.sql.functions", "from_utc_timestamp", y@jc, x) @@ -203,7 +1488,10 @@ setMethod("from_utc_timestamp", signature(y = "Column", x = "character"), #' NOTE: The position is not zero based, but 1 based index, returns 0 if substr #' could not be found in str. #' -#' @rdname functions +#' @family string_funcs +#' @rdname instr +#' @name instr +#' @export setMethod("instr", signature(y = "Column", x = "character"), function(y, x) { jc <- callJStatic("org.apache.spark.sql.functions", "instr", y@jc, x) @@ -215,13 +1503,16 @@ setMethod("instr", signature(y = "Column", x = "character"), #' Given a date column, returns the first date which is later than the value of the date column #' that is on the specified day of the week. #' -#' For example, `next <- day('2015-07-27', "Sunday")` returns 2015-08-02 because that is the first +#' For example, \code{next_day('2015-07-27', "Sunday")} returns 2015-08-02 because that is the first #' Sunday after 2015-07-27. #' #' Day of the week parameter is case insensitive, and accepts: #' "Mon", "Tue", "Wed", "Thu", "Fri", "Sat", "Sun". #' -#' @rdname functions +#' @family datetime_funcs +#' @rdname next_day +#' @name next_day +#' @export setMethod("next_day", signature(y = "Column", x = "character"), function(y, x) { jc <- callJStatic("org.apache.spark.sql.functions", "next_day", y@jc, x) @@ -232,7 +1523,10 @@ setMethod("next_day", signature(y = "Column", x = "character"), #' #' Assumes given timestamp is in given timezone and converts to UTC. #' -#' @rdname functions +#' @family datetime_funcs +#' @rdname to_utc_timestamp +#' @name to_utc_timestamp +#' @export setMethod("to_utc_timestamp", signature(y = "Column", x = "character"), function(y, x) { jc <- callJStatic("org.apache.spark.sql.functions", "to_utc_timestamp", y@jc, x) @@ -243,7 +1537,11 @@ setMethod("to_utc_timestamp", signature(y = "Column", x = "character"), #' #' Returns the date that is numMonths after startDate. 
#' -#' @rdname functions +#' @name add_months +#' @family datetime_funcs +#' @rdname add_months +#' @name add_months +#' @export setMethod("add_months", signature(y = "Column", x = "numeric"), function(y, x) { jc <- callJStatic("org.apache.spark.sql.functions", "add_months", y@jc, as.integer(x)) @@ -254,7 +1552,10 @@ setMethod("add_months", signature(y = "Column", x = "numeric"), #' #' Returns the date that is `days` days after `start` #' -#' @rdname functions +#' @family datetime_funcs +#' @rdname date_add +#' @name date_add +#' @export setMethod("date_add", signature(y = "Column", x = "numeric"), function(y, x) { jc <- callJStatic("org.apache.spark.sql.functions", "date_add", y@jc, as.integer(x)) @@ -265,7 +1566,10 @@ setMethod("date_add", signature(y = "Column", x = "numeric"), #' #' Returns the date that is `days` days before `start` #' -#' @rdname functions +#' @family datetime_funcs +#' @rdname date_sub +#' @name date_sub +#' @export setMethod("date_sub", signature(y = "Column", x = "numeric"), function(y, x) { jc <- callJStatic("org.apache.spark.sql.functions", "date_sub", y@jc, as.integer(x)) @@ -280,7 +1584,10 @@ setMethod("date_sub", signature(y = "Column", x = "numeric"), #' If d is 0, the result has no decimal point or fractional part. #' If d < 0, the result will be null.' #' -#' @rdname functions +#' @family string_funcs +#' @rdname format_number +#' @name format_number +#' @export setMethod("format_number", signature(y = "Column", x = "numeric"), function(y, x) { jc <- callJStatic("org.apache.spark.sql.functions", @@ -294,9 +1601,12 @@ setMethod("format_number", signature(y = "Column", x = "numeric"), #' Calculates the SHA-2 family of hash functions of a binary column and #' returns the value as a hex string. #' -#' @rdname functions #' @param y column to compute SHA-2 on. #' @param x one of 224, 256, 384, or 512. +#' @family misc_funcs +#' @rdname sha2 +#' @name sha2 +#' @export setMethod("sha2", signature(y = "Column", x = "numeric"), function(y, x) { jc <- callJStatic("org.apache.spark.sql.functions", "sha2", y@jc, as.integer(x)) @@ -308,7 +1618,10 @@ setMethod("sha2", signature(y = "Column", x = "numeric"), #' Shift the the given value numBits left. If the given value is a long value, this function #' will return a long value else it will return an integer value. #' -#' @rdname functions +#' @family math_funcs +#' @rdname shiftLeft +#' @name shiftLeft +#' @export setMethod("shiftLeft", signature(y = "Column", x = "numeric"), function(y, x) { jc <- callJStatic("org.apache.spark.sql.functions", @@ -322,7 +1635,10 @@ setMethod("shiftLeft", signature(y = "Column", x = "numeric"), #' Shift the the given value numBits right. If the given value is a long value, it will return #' a long value else it will return an integer value. #' -#' @rdname functions +#' @family math_funcs +#' @rdname shiftRight +#' @name shiftRight +#' @export setMethod("shiftRight", signature(y = "Column", x = "numeric"), function(y, x) { jc <- callJStatic("org.apache.spark.sql.functions", @@ -336,7 +1652,10 @@ setMethod("shiftRight", signature(y = "Column", x = "numeric"), #' Unsigned shift the the given value numBits right. If the given value is a long value, #' it will return a long value else it will return an integer value. 
#' -#' @rdname functions +#' @family math_funcs +#' @rdname shiftRightUnsigned +#' @name shiftRightUnsigned +#' @export setMethod("shiftRightUnsigned", signature(y = "Column", x = "numeric"), function(y, x) { jc <- callJStatic("org.apache.spark.sql.functions", @@ -350,7 +1669,10 @@ setMethod("shiftRightUnsigned", signature(y = "Column", x = "numeric"), #' Concatenates multiple input string columns together into a single string column, #' using the given separator. #' -#' @rdname functions +#' @family string_funcs +#' @rdname concat_ws +#' @name concat_ws +#' @export setMethod("concat_ws", signature(sep = "character", x = "Column"), function(sep, x, ...) { jcols <- listToSeq(lapply(list(x, ...), function(x) { x@jc })) @@ -362,7 +1684,10 @@ setMethod("concat_ws", signature(sep = "character", x = "Column"), #' #' Convert a number in a string column from one base to another. #' -#' @rdname functions +#' @family math_funcs +#' @rdname conv +#' @name conv +#' @export setMethod("conv", signature(x = "Column", fromBase = "numeric", toBase = "numeric"), function(x, fromBase, toBase) { fromBase <- as.integer(fromBase) @@ -378,7 +1703,10 @@ setMethod("conv", signature(x = "Column", fromBase = "numeric", toBase = "numeri #' Parses the expression string into the column that it represents, similar to #' DataFrame.selectExpr #' -#' @rdname functions +#' @family normal_funcs +#' @rdname expr +#' @name expr +#' @export setMethod("expr", signature(x = "character"), function(x) { jc <- callJStatic("org.apache.spark.sql.functions", "expr", x) @@ -389,7 +1717,10 @@ setMethod("expr", signature(x = "character"), #' #' Formats the arguments in printf-style and returns the result as a string column. #' -#' @rdname functions +#' @family string_funcs +#' @rdname format_string +#' @name format_string +#' @export setMethod("format_string", signature(format = "character", x = "Column"), function(format, x, ...) { jcols <- listToSeq(lapply(list(x, ...), function(arg) { arg@jc })) @@ -405,7 +1736,10 @@ setMethod("format_string", signature(format = "character", x = "Column"), #' representing the timestamp of that moment in the current system time zone in the given #' format. #' -#' @rdname functions +#' @family datetime_funcs +#' @rdname from_unixtime +#' @name from_unixtime +#' @export setMethod("from_unixtime", signature(x = "Column"), function(x, format = "yyyy-MM-dd HH:mm:ss") { jc <- callJStatic("org.apache.spark.sql.functions", @@ -420,7 +1754,10 @@ setMethod("from_unixtime", signature(x = "Column"), #' NOTE: The position is not zero based, but 1 based index, returns 0 if substr #' could not be found in str. #' -#' @rdname functions +#' @family string_funcs +#' @rdname locate +#' @name locate +#' @export setMethod("locate", signature(substr = "character", str = "Column"), function(substr, str, pos = 0) { jc <- callJStatic("org.apache.spark.sql.functions", @@ -433,7 +1770,10 @@ setMethod("locate", signature(substr = "character", str = "Column"), #' #' Left-pad the string column with #' -#' @rdname functions +#' @family string_funcs +#' @rdname lpad +#' @name lpad +#' @export setMethod("lpad", signature(x = "Column", len = "numeric", pad = "character"), function(x, len, pad) { jc <- callJStatic("org.apache.spark.sql.functions", @@ -446,12 +1786,19 @@ setMethod("lpad", signature(x = "Column", len = "numeric", pad = "character"), #' #' Generate a random column with i.i.d. samples from U[0.0, 1.0]. 
#' -#' @rdname functions +#' @family normal_funcs +#' @rdname rand +#' @name rand +#' @export setMethod("rand", signature(seed = "missing"), function(seed) { jc <- callJStatic("org.apache.spark.sql.functions", "rand") column(jc) }) +#' @family normal_funcs +#' @rdname rand +#' @name rand +#' @export setMethod("rand", signature(seed = "numeric"), function(seed) { jc <- callJStatic("org.apache.spark.sql.functions", "rand", as.integer(seed)) @@ -462,12 +1809,19 @@ setMethod("rand", signature(seed = "numeric"), #' #' Generate a column with i.i.d. samples from the standard normal distribution. #' -#' @rdname functions +#' @family normal_funcs +#' @rdname randn +#' @name randn +#' @export setMethod("randn", signature(seed = "missing"), function(seed) { jc <- callJStatic("org.apache.spark.sql.functions", "randn") column(jc) }) +#' @family normal_funcs +#' @rdname randn +#' @name randn +#' @export setMethod("randn", signature(seed = "numeric"), function(seed) { jc <- callJStatic("org.apache.spark.sql.functions", "randn", as.integer(seed)) @@ -478,7 +1832,10 @@ setMethod("randn", signature(seed = "numeric"), #' #' Extract a specific(idx) group identified by a java regex, from the specified string column. #' -#' @rdname functions +#' @family string_funcs +#' @rdname regexp_extract +#' @name regexp_extract +#' @export setMethod("regexp_extract", signature(x = "Column", pattern = "character", idx = "numeric"), function(x, pattern, idx) { @@ -492,7 +1849,10 @@ setMethod("regexp_extract", #' #' Replace all substrings of the specified string value that match regexp with rep. #' -#' @rdname functions +#' @family string_funcs +#' @rdname regexp_replace +#' @name regexp_replace +#' @export setMethod("regexp_replace", signature(x = "Column", pattern = "character", replacement = "character"), function(x, pattern, replacement) { @@ -506,7 +1866,10 @@ setMethod("regexp_replace", #' #' Right-padded with pad to a length of len. #' -#' @rdname functions +#' @family string_funcs +#' @rdname rpad +#' @name rpad +#' @export setMethod("rpad", signature(x = "Column", len = "numeric", pad = "character"), function(x, len, pad) { jc <- callJStatic("org.apache.spark.sql.functions", @@ -522,7 +1885,10 @@ setMethod("rpad", signature(x = "Column", len = "numeric", pad = "character"), #' returned. If count is negative, every to the right of the final delimiter (counting from the #' right) is returned. substring <- index performs a case-sensitive match when searching for delim. #' -#' @rdname functions +#' @family string_funcs +#' @rdname substring_index +#' @name substring_index +#' @export setMethod("substring_index", signature(x = "Column", delim = "character", count = "numeric"), function(x, delim, count) { @@ -539,7 +1905,10 @@ setMethod("substring_index", #' The translate will happen when any character in the string matching with the character #' in the matchingString. #' -#' @rdname functions +#' @family string_funcs +#' @rdname translate +#' @name translate +#' @export setMethod("translate", signature(x = "Column", matchingString = "character", replaceString = "character"), function(x, matchingString, replaceString) { @@ -552,30 +1921,28 @@ setMethod("translate", #' #' Gets current Unix timestamp in seconds. 
#' -#' @rdname functions +#' @family datetime_funcs +#' @rdname unix_timestamp +#' @name unix_timestamp +#' @export setMethod("unix_timestamp", signature(x = "missing", format = "missing"), function(x, format) { jc <- callJStatic("org.apache.spark.sql.functions", "unix_timestamp") column(jc) }) -#' unix_timestamp -#' -#' Converts time string in format yyyy-MM-dd HH:mm:ss to Unix timestamp (in seconds), -#' using the default timezone and the default locale, return null if fail. -#' -#' @rdname functions +#' @family datetime_funcs +#' @rdname unix_timestamp +#' @name unix_timestamp +#' @export setMethod("unix_timestamp", signature(x = "Column", format = "missing"), function(x, format) { jc <- callJStatic("org.apache.spark.sql.functions", "unix_timestamp", x@jc) column(jc) }) -#' unix_timestamp -#' -#' Convert time string with given pattern -#' (see [http://docs.oracle.com/javase/tutorial/i18n/format/simpleDateFormat.html]) -#' to Unix time stamp (in seconds), return null if fail. -#' -#' @rdname functions +#' @family datetime_funcs +#' @rdname unix_timestamp +#' @name unix_timestamp +#' @export setMethod("unix_timestamp", signature(x = "Column", format = "character"), function(x, format = "yyyy-MM-dd HH:mm:ss") { jc <- callJStatic("org.apache.spark.sql.functions", "unix_timestamp", x@jc, format) @@ -586,7 +1953,10 @@ setMethod("unix_timestamp", signature(x = "Column", format = "character"), #' Evaluates a list of conditions and returns one of multiple possible result expressions. #' For unmatched expressions null is returned. #' -#' @rdname column +#' @family normal_funcs +#' @rdname when +#' @name when +#' @export setMethod("when", signature(condition = "Column", value = "ANY"), function(condition, value) { condition <- condition@jc @@ -597,10 +1967,13 @@ setMethod("when", signature(condition = "Column", value = "ANY"), #' ifelse #' -#' Evaluates a list of conditions and returns `yes` if the conditions are satisfied. -#' Otherwise `no` is returned for unmatched conditions. +#' Evaluates a list of conditions and returns \code{yes} if the conditions are satisfied. +#' Otherwise \code{no} is returned for unmatched conditions. #' -#' @rdname column +#' @family normal_funcs +#' @rdname ifelse +#' @name ifelse +#' @export setMethod("ifelse", signature(test = "Column", yes = "ANY", no = "ANY"), function(test, yes, no) { diff --git a/R/pkg/R/generics.R b/R/pkg/R/generics.R index 84cb8dfdaa2dd..610a8c31223cd 100644 --- a/R/pkg/R/generics.R +++ b/R/pkg/R/generics.R @@ -567,10 +567,6 @@ setGeneric("withColumnRenamed", ###################### Column Methods ########################## -#' @rdname column -#' @export -setGeneric("approxCountDistinct", function(x, ...) { standardGeneric("approxCountDistinct") }) - #' @rdname column #' @export setGeneric("asc", function(x) { standardGeneric("asc") }) @@ -587,10 +583,6 @@ setGeneric("cast", function(x, dataType) { standardGeneric("cast") }) #' @export setGeneric("contains", function(x, ...) { standardGeneric("contains") }) -#' @rdname column -#' @export -setGeneric("countDistinct", function(x, ...) { standardGeneric("countDistinct") }) - #' @rdname column #' @export setGeneric("desc", function(x) { standardGeneric("desc") }) @@ -607,10 +599,6 @@ setGeneric("getField", function(x, ...) { standardGeneric("getField") }) #' @export setGeneric("getItem", function(x, ...) 
{ standardGeneric("getItem") }) -#' @rdname column -#' @export -setGeneric("hypot", function(y, x) { standardGeneric("hypot") }) - #' @rdname column #' @export setGeneric("isNull", function(x) { standardGeneric("isNull") }) @@ -619,30 +607,10 @@ setGeneric("isNull", function(x) { standardGeneric("isNull") }) #' @export setGeneric("isNotNull", function(x) { standardGeneric("isNotNull") }) -#' @rdname column -#' @export -setGeneric("last", function(x) { standardGeneric("last") }) - #' @rdname column #' @export setGeneric("like", function(x, ...) { standardGeneric("like") }) -#' @rdname column -#' @export -setGeneric("lower", function(x) { standardGeneric("lower") }) - -#' @rdname column -#' @export -setGeneric("n", function(x) { standardGeneric("n") }) - -#' @rdname column -#' @export -setGeneric("n_distinct", function(x, ...) { standardGeneric("n_distinct") }) - -#' @rdname column -#' @export -setGeneric("rint", function(x, ...) { standardGeneric("rint") }) - #' @rdname column #' @export setGeneric("rlike", function(x, ...) { standardGeneric("rlike") }) @@ -662,312 +630,340 @@ setGeneric("otherwise", function(x, value) { standardGeneric("otherwise") }) ###################### Expression Function Methods ########################## -#' @rdname functions +#' @rdname add_months #' @export setGeneric("add_months", function(y, x) { standardGeneric("add_months") }) -#' @rdname functions +#' @rdname approxCountDistinct +#' @export +setGeneric("approxCountDistinct", function(x, ...) { standardGeneric("approxCountDistinct") }) + +#' @rdname ascii #' @export setGeneric("ascii", function(x) { standardGeneric("ascii") }) -#' @rdname functions +#' @rdname avg #' @export setGeneric("avg", function(x, ...) { standardGeneric("avg") }) -#' @rdname functions +#' @rdname base64 #' @export setGeneric("base64", function(x) { standardGeneric("base64") }) -#' @rdname functions +#' @rdname bin #' @export setGeneric("bin", function(x) { standardGeneric("bin") }) -#' @rdname functions +#' @rdname bitwiseNOT #' @export setGeneric("bitwiseNOT", function(x) { standardGeneric("bitwiseNOT") }) -#' @rdname functions +#' @rdname cbrt #' @export setGeneric("cbrt", function(x) { standardGeneric("cbrt") }) -#' @rdname functions +#' @rdname ceil #' @export setGeneric("ceil", function(x) { standardGeneric("ceil") }) -#' @rdname functions +#' @rdname concat #' @export setGeneric("concat", function(x, ...) { standardGeneric("concat") }) -#' @rdname functions +#' @rdname concat_ws #' @export setGeneric("concat_ws", function(sep, x, ...) { standardGeneric("concat_ws") }) -#' @rdname functions +#' @rdname conv #' @export setGeneric("conv", function(x, fromBase, toBase) { standardGeneric("conv") }) -#' @rdname functions +#' @rdname countDistinct +#' @export +setGeneric("countDistinct", function(x, ...) 
{ standardGeneric("countDistinct") }) + +#' @rdname crc32 #' @export setGeneric("crc32", function(x) { standardGeneric("crc32") }) -#' @rdname functions +#' @rdname datediff #' @export setGeneric("datediff", function(y, x) { standardGeneric("datediff") }) -#' @rdname functions +#' @rdname date_add #' @export setGeneric("date_add", function(y, x) { standardGeneric("date_add") }) -#' @rdname functions +#' @rdname date_format #' @export setGeneric("date_format", function(y, x) { standardGeneric("date_format") }) -#' @rdname functions +#' @rdname date_sub #' @export setGeneric("date_sub", function(y, x) { standardGeneric("date_sub") }) -#' @rdname functions +#' @rdname dayofmonth #' @export setGeneric("dayofmonth", function(x) { standardGeneric("dayofmonth") }) -#' @rdname functions +#' @rdname dayofyear #' @export setGeneric("dayofyear", function(x) { standardGeneric("dayofyear") }) -#' @rdname functions +#' @rdname explode #' @export setGeneric("explode", function(x) { standardGeneric("explode") }) -#' @rdname functions +#' @rdname expr #' @export setGeneric("expr", function(x) { standardGeneric("expr") }) -#' @rdname functions +#' @rdname from_utc_timestamp #' @export setGeneric("from_utc_timestamp", function(y, x) { standardGeneric("from_utc_timestamp") }) -#' @rdname functions +#' @rdname format_number #' @export setGeneric("format_number", function(y, x) { standardGeneric("format_number") }) -#' @rdname functions +#' @rdname format_string #' @export setGeneric("format_string", function(format, x, ...) { standardGeneric("format_string") }) -#' @rdname functions +#' @rdname from_unixtime #' @export setGeneric("from_unixtime", function(x, ...) { standardGeneric("from_unixtime") }) -#' @rdname functions +#' @rdname greatest #' @export setGeneric("greatest", function(x, ...) { standardGeneric("greatest") }) -#' @rdname functions +#' @rdname hex #' @export setGeneric("hex", function(x) { standardGeneric("hex") }) -#' @rdname functions +#' @rdname hour #' @export setGeneric("hour", function(x) { standardGeneric("hour") }) -#' @rdname functions +#' @rdname hypot +#' @export +setGeneric("hypot", function(y, x) { standardGeneric("hypot") }) + +#' @rdname initcap #' @export setGeneric("initcap", function(x) { standardGeneric("initcap") }) -#' @rdname functions +#' @rdname instr #' @export setGeneric("instr", function(y, x) { standardGeneric("instr") }) -#' @rdname functions +#' @rdname isNaN #' @export setGeneric("isNaN", function(x) { standardGeneric("isNaN") }) -#' @rdname functions +#' @rdname last +#' @export +setGeneric("last", function(x) { standardGeneric("last") }) + +#' @rdname last_day #' @export setGeneric("last_day", function(x) { standardGeneric("last_day") }) -#' @rdname functions +#' @rdname least #' @export setGeneric("least", function(x, ...) { standardGeneric("least") }) -#' @rdname functions +#' @rdname levenshtein #' @export setGeneric("levenshtein", function(y, x) { standardGeneric("levenshtein") }) -#' @rdname functions +#' @rdname lit #' @export setGeneric("lit", function(x) { standardGeneric("lit") }) -#' @rdname functions +#' @rdname locate #' @export setGeneric("locate", function(substr, str, ...) 
{ standardGeneric("locate") }) -#' @rdname functions +#' @rdname lower #' @export setGeneric("lower", function(x) { standardGeneric("lower") }) -#' @rdname functions +#' @rdname lpad #' @export setGeneric("lpad", function(x, len, pad) { standardGeneric("lpad") }) -#' @rdname functions +#' @rdname ltrim #' @export setGeneric("ltrim", function(x) { standardGeneric("ltrim") }) -#' @rdname functions +#' @rdname md5 #' @export setGeneric("md5", function(x) { standardGeneric("md5") }) -#' @rdname functions +#' @rdname minute #' @export setGeneric("minute", function(x) { standardGeneric("minute") }) -#' @rdname functions +#' @rdname month #' @export setGeneric("month", function(x) { standardGeneric("month") }) -#' @rdname functions +#' @rdname months_between #' @export setGeneric("months_between", function(y, x) { standardGeneric("months_between") }) -#' @rdname functions +#' @rdname count +#' @export +setGeneric("n", function(x) { standardGeneric("n") }) + +#' @rdname nanvl #' @export setGeneric("nanvl", function(y, x) { standardGeneric("nanvl") }) -#' @rdname functions +#' @rdname negate #' @export setGeneric("negate", function(x) { standardGeneric("negate") }) -#' @rdname functions +#' @rdname next_day #' @export setGeneric("next_day", function(y, x) { standardGeneric("next_day") }) -#' @rdname functions +#' @rdname countDistinct +#' @export +setGeneric("n_distinct", function(x, ...) { standardGeneric("n_distinct") }) + +#' @rdname pmod #' @export setGeneric("pmod", function(y, x) { standardGeneric("pmod") }) -#' @rdname functions +#' @rdname quarter #' @export setGeneric("quarter", function(x) { standardGeneric("quarter") }) -#' @rdname functions +#' @rdname rand #' @export setGeneric("rand", function(seed) { standardGeneric("rand") }) -#' @rdname functions +#' @rdname randn #' @export setGeneric("randn", function(seed) { standardGeneric("randn") }) -#' @rdname functions +#' @rdname regexp_extract #' @export setGeneric("regexp_extract", function(x, pattern, idx) { standardGeneric("regexp_extract") }) -#' @rdname functions +#' @rdname regexp_replace #' @export setGeneric("regexp_replace", function(x, pattern, replacement) { standardGeneric("regexp_replace") }) -#' @rdname functions +#' @rdname reverse #' @export setGeneric("reverse", function(x) { standardGeneric("reverse") }) -#' @rdname functions +#' @rdname rint +#' @export +setGeneric("rint", function(x, ...) 
{ standardGeneric("rint") }) + +#' @rdname rpad #' @export setGeneric("rpad", function(x, len, pad) { standardGeneric("rpad") }) -#' @rdname functions +#' @rdname rtrim #' @export setGeneric("rtrim", function(x) { standardGeneric("rtrim") }) -#' @rdname functions +#' @rdname second #' @export setGeneric("second", function(x) { standardGeneric("second") }) -#' @rdname functions +#' @rdname sha1 #' @export setGeneric("sha1", function(x) { standardGeneric("sha1") }) -#' @rdname functions +#' @rdname sha2 #' @export setGeneric("sha2", function(y, x) { standardGeneric("sha2") }) -#' @rdname functions +#' @rdname shiftLeft #' @export setGeneric("shiftLeft", function(y, x) { standardGeneric("shiftLeft") }) -#' @rdname functions +#' @rdname shiftRight #' @export setGeneric("shiftRight", function(y, x) { standardGeneric("shiftRight") }) -#' @rdname functions +#' @rdname shiftRightUnsigned #' @export setGeneric("shiftRightUnsigned", function(y, x) { standardGeneric("shiftRightUnsigned") }) -#' @rdname functions +#' @rdname signum #' @export setGeneric("signum", function(x) { standardGeneric("signum") }) -#' @rdname functions +#' @rdname size #' @export setGeneric("size", function(x) { standardGeneric("size") }) -#' @rdname functions +#' @rdname soundex #' @export setGeneric("soundex", function(x) { standardGeneric("soundex") }) -#' @rdname functions +#' @rdname substring_index #' @export setGeneric("substring_index", function(x, delim, count) { standardGeneric("substring_index") }) -#' @rdname functions +#' @rdname sumDistinct #' @export setGeneric("sumDistinct", function(x) { standardGeneric("sumDistinct") }) -#' @rdname functions +#' @rdname toDegrees #' @export setGeneric("toDegrees", function(x) { standardGeneric("toDegrees") }) -#' @rdname functions +#' @rdname toRadians #' @export setGeneric("toRadians", function(x) { standardGeneric("toRadians") }) -#' @rdname functions +#' @rdname to_date #' @export setGeneric("to_date", function(x) { standardGeneric("to_date") }) -#' @rdname functions +#' @rdname to_utc_timestamp #' @export setGeneric("to_utc_timestamp", function(y, x) { standardGeneric("to_utc_timestamp") }) -#' @rdname functions +#' @rdname translate #' @export setGeneric("translate", function(x, matchingString, replaceString) { standardGeneric("translate") }) -#' @rdname functions +#' @rdname trim #' @export setGeneric("trim", function(x) { standardGeneric("trim") }) -#' @rdname functions +#' @rdname unbase64 #' @export setGeneric("unbase64", function(x) { standardGeneric("unbase64") }) -#' @rdname functions +#' @rdname unhex #' @export setGeneric("unhex", function(x) { standardGeneric("unhex") }) -#' @rdname functions +#' @rdname unix_timestamp #' @export setGeneric("unix_timestamp", function(x, format) { standardGeneric("unix_timestamp") }) -#' @rdname functions +#' @rdname upper #' @export setGeneric("upper", function(x) { standardGeneric("upper") }) -#' @rdname functions +#' @rdname weekofyear #' @export setGeneric("weekofyear", function(x) { standardGeneric("weekofyear") }) -#' @rdname functions +#' @rdname year #' @export setGeneric("year", function(x) { standardGeneric("year") }) From 642c43c81c835139e3f35dfd6a215d668a474203 Mon Sep 17 00:00:00 2001 From: Feynman Liang Date: Mon, 24 Aug 2015 19:45:41 -0700 Subject: [PATCH 055/802] [SQL] [MINOR] [DOC] Clarify docs for inferring DataFrame from RDD of Products * Makes `SQLImplicits.rddToDataFrameHolder` scaladoc consistent with `SQLContext.createDataFrame[A <: Product](rdd: RDD[A])` since the former is essentially a wrapper for the 
latter * Clarifies `createDataFrame[A <: Product]` scaladoc to apply for any `RDD[Product]`, not just case classes Author: Feynman Liang Closes #8406 from feynmanliang/sql-doc-fixes. --- sql/core/src/main/scala/org/apache/spark/sql/SQLContext.scala | 2 +- sql/core/src/main/scala/org/apache/spark/sql/SQLImplicits.scala | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/sql/core/src/main/scala/org/apache/spark/sql/SQLContext.scala b/sql/core/src/main/scala/org/apache/spark/sql/SQLContext.scala index 126c9c6f839c7..a1eea09e0477b 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/SQLContext.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/SQLContext.scala @@ -350,7 +350,7 @@ class SQLContext(@transient val sparkContext: SparkContext) /** * :: Experimental :: - * Creates a DataFrame from an RDD of case classes. + * Creates a DataFrame from an RDD of Product (e.g. case classes, tuples). * * @group dataframes * @since 1.3.0 diff --git a/sql/core/src/main/scala/org/apache/spark/sql/SQLImplicits.scala b/sql/core/src/main/scala/org/apache/spark/sql/SQLImplicits.scala index 47b6f80bed483..bf03c61088426 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/SQLImplicits.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/SQLImplicits.scala @@ -40,7 +40,7 @@ private[sql] abstract class SQLImplicits { implicit def symbolToColumn(s: Symbol): ColumnName = new ColumnName(s.name) /** - * Creates a DataFrame from an RDD of case classes or tuples. + * Creates a DataFrame from an RDD of Product (e.g. case classes, tuples). * @since 1.3.0 */ implicit def rddToDataFrameHolder[A <: Product : TypeTag](rdd: RDD[A]): DataFrameHolder = { From a0c0aae1defe5e1e57704065631d201f8e3f6bac Mon Sep 17 00:00:00 2001 From: Yin Huai Date: Tue, 25 Aug 2015 12:49:50 +0800 Subject: [PATCH 056/802] [SPARK-10121] [SQL] Thrift server always use the latest class loader provided by the conf of executionHive's state https://issues.apache.org/jira/browse/SPARK-10121 Looks like the problem is that if we add a jar through another thread, the thread handling the JDBC session will not get the latest classloader. Author: Yin Huai Closes #8368 from yhuai/SPARK-10121. --- .../SparkExecuteStatementOperation.scala | 6 +++ .../HiveThriftServer2Suites.scala | 54 +++++++++++++++++++ 2 files changed, 60 insertions(+) diff --git a/sql/hive-thriftserver/src/main/scala/org/apache/spark/sql/hive/thriftserver/SparkExecuteStatementOperation.scala b/sql/hive-thriftserver/src/main/scala/org/apache/spark/sql/hive/thriftserver/SparkExecuteStatementOperation.scala index 833bf62d47d07..02cc7e5efa521 100644 --- a/sql/hive-thriftserver/src/main/scala/org/apache/spark/sql/hive/thriftserver/SparkExecuteStatementOperation.scala +++ b/sql/hive-thriftserver/src/main/scala/org/apache/spark/sql/hive/thriftserver/SparkExecuteStatementOperation.scala @@ -159,6 +159,12 @@ private[hive] class SparkExecuteStatementOperation( // User information is part of the metastore client member in Hive hiveContext.setSession(currentSqlSession) + // Always use the latest class loader provided by executionHive's state. 
+ val executionHiveClassLoader = + hiveContext.executionHive.state.getConf.getClassLoader + sessionHive.getConf.setClassLoader(executionHiveClassLoader) + parentSessionState.getConf.setClassLoader(executionHiveClassLoader) + Hive.set(sessionHive) SessionState.setCurrentSessionState(parentSessionState) try { diff --git a/sql/hive-thriftserver/src/test/scala/org/apache/spark/sql/hive/thriftserver/HiveThriftServer2Suites.scala b/sql/hive-thriftserver/src/test/scala/org/apache/spark/sql/hive/thriftserver/HiveThriftServer2Suites.scala index ded42bca9971e..b72249b3bf8c0 100644 --- a/sql/hive-thriftserver/src/test/scala/org/apache/spark/sql/hive/thriftserver/HiveThriftServer2Suites.scala +++ b/sql/hive-thriftserver/src/test/scala/org/apache/spark/sql/hive/thriftserver/HiveThriftServer2Suites.scala @@ -377,6 +377,60 @@ class HiveThriftBinaryServerSuite extends HiveThriftJdbcTest { rs2.close() } } + + test("test add jar") { + withMultipleConnectionJdbcStatement( + { + statement => + val jarFile = + "../hive/src/test/resources/hive-hcatalog-core-0.13.1.jar" + .split("/") + .mkString(File.separator) + + statement.executeQuery(s"ADD JAR $jarFile") + }, + + { + statement => + val queries = Seq( + "DROP TABLE IF EXISTS smallKV", + "CREATE TABLE smallKV(key INT, val STRING)", + s"LOAD DATA LOCAL INPATH '${TestData.smallKv}' OVERWRITE INTO TABLE smallKV", + "DROP TABLE IF EXISTS addJar", + """CREATE TABLE addJar(key string) + |ROW FORMAT SERDE 'org.apache.hive.hcatalog.data.JsonSerDe' + """.stripMargin) + + queries.foreach(statement.execute) + + statement.executeQuery( + """ + |INSERT INTO TABLE addJar SELECT 'k1' as key FROM smallKV limit 1 + """.stripMargin) + + val actualResult = + statement.executeQuery("SELECT key FROM addJar") + val actualResultBuffer = new collection.mutable.ArrayBuffer[String]() + while (actualResult.next()) { + actualResultBuffer += actualResult.getString(1) + } + actualResult.close() + + val expectedResult = + statement.executeQuery("SELECT 'k1'") + val expectedResultBuffer = new collection.mutable.ArrayBuffer[String]() + while (expectedResult.next()) { + expectedResultBuffer += expectedResult.getString(1) + } + expectedResult.close() + + assert(expectedResultBuffer === actualResultBuffer) + + statement.executeQuery("DROP TABLE IF EXISTS addJar") + statement.executeQuery("DROP TABLE IF EXISTS smallKV") + } + ) + } } class HiveThriftHttpServerSuite extends HiveThriftJdbcTest { From 5175ca0c85b10045d12c3fb57b1e52278a413ecf Mon Sep 17 00:00:00 2001 From: Michael Armbrust Date: Mon, 24 Aug 2015 23:15:27 -0700 Subject: [PATCH 057/802] [SPARK-10178] [SQL] HiveComparisionTest should print out dependent tables In `HiveComparisionTest`s it is possible to fail a query of the form `SELECT * FROM dest1`, where `dest1` is the query that is actually computing the incorrect results. To aid debugging this patch improves the harness to also print these query plans and their results. Author: Michael Armbrust Closes #8388 from marmbrus/generatedTables. 
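As a purely illustrative aside before the diff itself: the debugging idea in this commit message can be sketched with toy classes, namely walking a plan-like tree of the failing query, collecting the tables it reads, and dumping those alongside the mismatch. The trait and class names below are invented for the example and are not Spark's planner types.
```
sealed trait Plan { def children: Seq[Plan] }
case class TableScan(tableName: String) extends Plan { def children: Seq[Plan] = Nil }
case class Op(name: String, children: Plan*) extends Plan

object DependentTables {
  // Collect every table name referenced anywhere in the (toy) plan tree.
  def tablesRead(plan: Plan): Set[String] = plan match {
    case TableScan(name) => Set(name)
    case other           => other.children.flatMap(tablesRead).toSet
  }

  def main(args: Array[String]): Unit = {
    val failingQuery = Op("Project", Op("Filter", TableScan("dest1")))
    // Any earlier statement in the test case that writes one of these tables
    // is worth printing next to the mismatch, which is what the patch automates.
    println(tablesRead(failingQuery)) // Set(dest1)
  }
}
```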
--- .../hive/execution/HiveComparisonTest.scala | 36 +++++++++++++++++++ 1 file changed, 36 insertions(+) diff --git a/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/HiveComparisonTest.scala b/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/HiveComparisonTest.scala index 2bdb0e11878e5..4d45249d9c6b8 100644 --- a/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/HiveComparisonTest.scala +++ b/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/HiveComparisonTest.scala @@ -19,6 +19,8 @@ package org.apache.spark.sql.hive.execution import java.io._ +import scala.util.control.NonFatal + import org.scalatest.{BeforeAndAfterAll, GivenWhenThen} import org.apache.spark.{Logging, SparkFunSuite} @@ -386,11 +388,45 @@ abstract class HiveComparisonTest hiveCacheFiles.foreach(_.delete()) } + // If this query is reading other tables that were created during this test run + // also print out the query plans and results for those. + val computedTablesMessages: String = try { + val tablesRead = new TestHive.QueryExecution(query).executedPlan.collect { + case ts: HiveTableScan => ts.relation.tableName + }.toSet + + TestHive.reset() + val executions = queryList.map(new TestHive.QueryExecution(_)) + executions.foreach(_.toRdd) + val tablesGenerated = queryList.zip(executions).flatMap { + case (q, e) => e.executedPlan.collect { + case i: InsertIntoHiveTable if tablesRead contains i.table.tableName => + (q, e, i) + } + } + + tablesGenerated.map { case (hiveql, execution, insert) => + s""" + |=== Generated Table === + |$hiveql + |$execution + |== Results == + |${insert.child.execute().collect().mkString("\n")} + """.stripMargin + }.mkString("\n") + + } catch { + case NonFatal(e) => + logError("Failed to compute generated tables", e) + s"Couldn't compute dependent tables: $e" + } + val errorMessage = s""" |Results do not match for $testCaseName: |$hiveQuery\n${hiveQuery.analyzed.output.map(_.name).mkString("\t")} |$resultComparison + |$computedTablesMessages """.stripMargin stringToFile(new File(wrongDirectory, testCaseName), errorMessage + consoleTestCase) From d9c25dec87e6da7d66a47ff94e7eefa008081b9d Mon Sep 17 00:00:00 2001 From: cody koeninger Date: Mon, 24 Aug 2015 23:26:14 -0700 Subject: [PATCH 058/802] =?UTF-8?q?[SPARK-9786]=20[STREAMING]=20[KAFKA]=20?= =?UTF-8?q?fix=20backpressure=20so=20it=20works=20with=20defa=E2=80=A6?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit …ult maxRatePerPartition setting of 0 Author: cody koeninger Closes #8413 from koeninger/backpressure-testing-master. 
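For readers skimming the change below: the core of the fix is how the per-partition rate is derived when the static cap is left at its default of 0. Here is a self-contained sketch of that capping rule; the object, method, and argument names are made up for illustration and are not the actual DirectKafkaInputDStream members.
```
// Standalone sketch of the per-partition rate cap described above.
// A cap of 0 (the default) means "no static limit", so only the
// estimator's limit, when present and positive, should apply.
object RateCapSketch {
  def effectiveRatePerPartition(
      estimatedRate: Option[Long],   // from the backpressure estimator
      maxRatePerPartition: Long,     // 0 = unlimited (the default)
      numPartitions: Int): Long = {
    estimatedRate
      .filter(_ > 0)
      .map { limit =>
        if (maxRatePerPartition > 0) math.min(maxRatePerPartition, limit / numPartitions)
        else limit / numPartitions
      }
      .getOrElse(maxRatePerPartition)
  }

  def main(args: Array[String]): Unit = {
    // With the default cap of 0, the estimator's limit is still honored.
    println(effectiveRatePerPartition(Some(1000L), 0L, 4))    // 250
    // An explicit cap wins when it is smaller than the estimate.
    println(effectiveRatePerPartition(Some(1000L), 100L, 4))  // 100
    // No estimate yet: fall back to the configured cap (0 = unlimited).
    println(effectiveRatePerPartition(None, 0L, 4))           // 0
  }
}
```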
--- .../spark/streaming/kafka/DirectKafkaInputDStream.scala | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/external/kafka/src/main/scala/org/apache/spark/streaming/kafka/DirectKafkaInputDStream.scala b/external/kafka/src/main/scala/org/apache/spark/streaming/kafka/DirectKafkaInputDStream.scala index 8a177077775c6..1000094e93cb3 100644 --- a/external/kafka/src/main/scala/org/apache/spark/streaming/kafka/DirectKafkaInputDStream.scala +++ b/external/kafka/src/main/scala/org/apache/spark/streaming/kafka/DirectKafkaInputDStream.scala @@ -95,8 +95,13 @@ class DirectKafkaInputDStream[ val effectiveRateLimitPerPartition = estimatedRateLimit .filter(_ > 0) - .map(limit => Math.min(maxRateLimitPerPartition, (limit / numPartitions))) - .getOrElse(maxRateLimitPerPartition) + .map { limit => + if (maxRateLimitPerPartition > 0) { + Math.min(maxRateLimitPerPartition, (limit / numPartitions)) + } else { + limit / numPartitions + } + }.getOrElse(maxRateLimitPerPartition) if (effectiveRateLimitPerPartition > 0) { val secsPerBatch = context.graph.batchDuration.milliseconds.toDouble / 1000 From f023aa2fcc1d1dbb82aee568be0a8f2457c309ae Mon Sep 17 00:00:00 2001 From: zsxwing Date: Mon, 24 Aug 2015 23:34:50 -0700 Subject: [PATCH 059/802] [SPARK-10137] [STREAMING] Avoid to restart receivers if scheduleReceivers returns balanced results This PR fixes the following cases for `ReceiverSchedulingPolicy`. 1) Assume there are 4 executors: host1, host2, host3, host4, and 5 receivers: r1, r2, r3, r4, r5. Then `ReceiverSchedulingPolicy.scheduleReceivers` will return (r1 -> host1, r2 -> host2, r3 -> host3, r4 -> host4, r5 -> host1). Let's assume r1 starts first on `host1` as `scheduleReceivers` suggested, and tries to register with ReceiverTracker. But the previous `ReceiverSchedulingPolicy.rescheduleReceiver` will return (host2, host3, host4) according to the current executor weights (host1 -> 1.0, host2 -> 0.5, host3 -> 0.5, host4 -> 0.5), so ReceiverTracker will reject `r1`. This is unexpected since r1 is starting exactly where `scheduleReceivers` suggested. This case can be fixed by ignoring the information of the receiver that is rescheduling in `receiverTrackingInfoMap`. 2) Assume there are 3 executors (host1, host2, host3), each with 3 cores, and 3 receivers: r1, r2, r3. Assume r1 is running on host1. Now r2 is restarting; the previous `ReceiverSchedulingPolicy.rescheduleReceiver` will always return (host1, host2, host3). So it's possible that r2 will be scheduled to host1 by TaskScheduler. r3 is similar. Then at last, it's possible that there are 3 receivers running on host1, while host2 and host3 are idle. This issue can be fixed by returning only the executors that have the minimum weight rather than returning at least 3 executors. Author: zsxwing Closes #8340 from zsxwing/fix-receiver-scheduling.
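A minimal, self-contained model of the selection rule described in case 2) may help: return the idle executors if there are any, otherwise return every executor tied for the minimum weight. The host names and the weight map below are invented for the example; the real policy computes weights from receiver tracking info, as shown in the diff that follows.
```
// Toy model of the rescheduling rule described above: prefer idle
// executors; otherwise return all executors tied for the minimum weight
// (instead of always returning three candidates).
object MinWeightSketch {
  def candidates(executors: Seq[String], weights: Map[String, Double]): Seq[String] = {
    val idle = executors.filterNot(weights.contains)
    if (idle.nonEmpty) {
      idle
    } else {
      val minWeight = executors.map(weights).min
      executors.filter(e => weights(e) == minWeight)
    }
  }

  def main(args: Array[String]): Unit = {
    val executors = Seq("host1", "host2", "host3", "host4", "host5")
    // host3 carries no weight, so only the idle host is returned.
    println(candidates(executors,
      Map("host1" -> 1.5, "host2" -> 0.5, "host4" -> 0.5, "host5" -> 0.5)))   // List(host3)
    // No idle hosts: everything tied for the minimum weight is returned.
    println(candidates(executors,
      Map("host1" -> 1.5, "host2" -> 0.5, "host3" -> 1.0, "host4" -> 0.5, "host5" -> 0.5)))
    // List(host2, host4, host5)
  }
}
```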
--- .../scheduler/ReceiverSchedulingPolicy.scala | 58 +++++++--- .../streaming/scheduler/ReceiverTracker.scala | 106 ++++++++++++------ .../ReceiverSchedulingPolicySuite.scala | 13 ++- 3 files changed, 120 insertions(+), 57 deletions(-) diff --git a/streaming/src/main/scala/org/apache/spark/streaming/scheduler/ReceiverSchedulingPolicy.scala b/streaming/src/main/scala/org/apache/spark/streaming/scheduler/ReceiverSchedulingPolicy.scala index ef5b687b5831a..10b5a7f57a802 100644 --- a/streaming/src/main/scala/org/apache/spark/streaming/scheduler/ReceiverSchedulingPolicy.scala +++ b/streaming/src/main/scala/org/apache/spark/streaming/scheduler/ReceiverSchedulingPolicy.scala @@ -22,6 +22,36 @@ import scala.collection.mutable import org.apache.spark.streaming.receiver.Receiver +/** + * A class that tries to schedule receivers with evenly distributed. There are two phases for + * scheduling receivers. + * + * - The first phase is global scheduling when ReceiverTracker is starting and we need to schedule + * all receivers at the same time. ReceiverTracker will call `scheduleReceivers` at this phase. + * It will try to schedule receivers with evenly distributed. ReceiverTracker should update its + * receiverTrackingInfoMap according to the results of `scheduleReceivers`. + * `ReceiverTrackingInfo.scheduledExecutors` for each receiver will set to an executor list that + * contains the scheduled locations. Then when a receiver is starting, it will send a register + * request and `ReceiverTracker.registerReceiver` will be called. In + * `ReceiverTracker.registerReceiver`, if a receiver's scheduled executors is set, it should check + * if the location of this receiver is one of the scheduled executors, if not, the register will + * be rejected. + * - The second phase is local scheduling when a receiver is restarting. There are two cases of + * receiver restarting: + * - If a receiver is restarting because it's rejected due to the real location and the scheduled + * executors mismatching, in other words, it fails to start in one of the locations that + * `scheduleReceivers` suggested, `ReceiverTracker` should firstly choose the executors that are + * still alive in the list of scheduled executors, then use them to launch the receiver job. + * - If a receiver is restarting without a scheduled executors list, or the executors in the list + * are dead, `ReceiverTracker` should call `rescheduleReceiver`. If so, `ReceiverTracker` should + * not set `ReceiverTrackingInfo.scheduledExecutors` for this executor, instead, it should clear + * it. Then when this receiver is registering, we can know this is a local scheduling, and + * `ReceiverTrackingInfo` should call `rescheduleReceiver` again to check if the launching + * location is matching. + * + * In conclusion, we should make a global schedule, try to achieve that exactly as long as possible, + * otherwise do local scheduling. + */ private[streaming] class ReceiverSchedulingPolicy { /** @@ -102,8 +132,7 @@ private[streaming] class ReceiverSchedulingPolicy { /** * Return a list of candidate executors to run the receiver. If the list is empty, the caller can - * run this receiver in arbitrary executor. The caller can use `preferredNumExecutors` to require - * returning `preferredNumExecutors` executors if possible. + * run this receiver in arbitrary executor. * * This method tries to balance executors' load. Here is the approach to schedule executors * for a receiver. 
@@ -122,9 +151,8 @@ private[streaming] class ReceiverSchedulingPolicy { * If a receiver is scheduled to an executor but has not yet run, it contributes * `1.0 / #candidate_executors_of_this_receiver` to the executor's weight. * - * At last, if there are more than `preferredNumExecutors` idle executors (weight = 0), - * returns all idle executors. Otherwise, we only return `preferredNumExecutors` best options - * according to the weights. + * At last, if there are any idle executors (weight = 0), returns all idle executors. + * Otherwise, returns the executors that have the minimum weight. * * * @@ -134,8 +162,7 @@ private[streaming] class ReceiverSchedulingPolicy { receiverId: Int, preferredLocation: Option[String], receiverTrackingInfoMap: Map[Int, ReceiverTrackingInfo], - executors: Seq[String], - preferredNumExecutors: Int = 3): Seq[String] = { + executors: Seq[String]): Seq[String] = { if (executors.isEmpty) { return Seq.empty } @@ -156,15 +183,18 @@ private[streaming] class ReceiverSchedulingPolicy { } }.groupBy(_._1).mapValues(_.map(_._2).sum) // Sum weights for each executor - val idleExecutors = (executors.toSet -- executorWeights.keys).toSeq - if (idleExecutors.size >= preferredNumExecutors) { - // If there are more than `preferredNumExecutors` idle executors, return all of them + val idleExecutors = executors.toSet -- executorWeights.keys + if (idleExecutors.nonEmpty) { scheduledExecutors ++= idleExecutors } else { - // If there are less than `preferredNumExecutors` idle executors, return 3 best options - scheduledExecutors ++= idleExecutors - val sortedExecutors = executorWeights.toSeq.sortBy(_._2).map(_._1) - scheduledExecutors ++= (idleExecutors ++ sortedExecutors).take(preferredNumExecutors) + // There is no idle executor. So select all executors that have the minimum weight. + val sortedExecutors = executorWeights.toSeq.sortBy(_._2) + if (sortedExecutors.nonEmpty) { + val minWeight = sortedExecutors(0)._2 + scheduledExecutors ++= sortedExecutors.takeWhile(_._2 == minWeight).map(_._1) + } else { + // This should not happen since "executors" is not empty + } } scheduledExecutors.toSeq } diff --git a/streaming/src/main/scala/org/apache/spark/streaming/scheduler/ReceiverTracker.scala b/streaming/src/main/scala/org/apache/spark/streaming/scheduler/ReceiverTracker.scala index 30d25a64e307a..3d532a675db02 100644 --- a/streaming/src/main/scala/org/apache/spark/streaming/scheduler/ReceiverTracker.scala +++ b/streaming/src/main/scala/org/apache/spark/streaming/scheduler/ReceiverTracker.scala @@ -244,8 +244,21 @@ class ReceiverTracker(ssc: StreamingContext, skipReceiverLaunch: Boolean = false } if (isTrackerStopping || isTrackerStopped) { - false - } else if (!scheduleReceiver(streamId).contains(hostPort)) { + return false + } + + val scheduledExecutors = receiverTrackingInfos(streamId).scheduledExecutors + val accetableExecutors = if (scheduledExecutors.nonEmpty) { + // This receiver is registering and it's scheduled by + // ReceiverSchedulingPolicy.scheduleReceivers. So use "scheduledExecutors" to check it. + scheduledExecutors.get + } else { + // This receiver is scheduled by "ReceiverSchedulingPolicy.rescheduleReceiver", so calling + // "ReceiverSchedulingPolicy.rescheduleReceiver" again to check it. 
+ scheduleReceiver(streamId) + } + + if (!accetableExecutors.contains(hostPort)) { // Refuse it since it's scheduled to a wrong executor false } else { @@ -426,12 +439,25 @@ class ReceiverTracker(ssc: StreamingContext, skipReceiverLaunch: Boolean = false startReceiver(receiver, executors) } case RestartReceiver(receiver) => - val scheduledExecutors = schedulingPolicy.rescheduleReceiver( - receiver.streamId, - receiver.preferredLocation, - receiverTrackingInfos, - getExecutors) - updateReceiverScheduledExecutors(receiver.streamId, scheduledExecutors) + // Old scheduled executors minus the ones that are not active any more + val oldScheduledExecutors = getStoredScheduledExecutors(receiver.streamId) + val scheduledExecutors = if (oldScheduledExecutors.nonEmpty) { + // Try global scheduling again + oldScheduledExecutors + } else { + val oldReceiverInfo = receiverTrackingInfos(receiver.streamId) + // Clear "scheduledExecutors" to indicate we are going to do local scheduling + val newReceiverInfo = oldReceiverInfo.copy( + state = ReceiverState.INACTIVE, scheduledExecutors = None) + receiverTrackingInfos(receiver.streamId) = newReceiverInfo + schedulingPolicy.rescheduleReceiver( + receiver.streamId, + receiver.preferredLocation, + receiverTrackingInfos, + getExecutors) + } + // Assume there is one receiver restarting at one time, so we don't need to update + // receiverTrackingInfos startReceiver(receiver, scheduledExecutors) case c: CleanupOldBlocks => receiverTrackingInfos.values.flatMap(_.endpoint).foreach(_.send(c)) @@ -464,6 +490,24 @@ class ReceiverTracker(ssc: StreamingContext, skipReceiverLaunch: Boolean = false context.reply(true) } + /** + * Return the stored scheduled executors that are still alive. + */ + private def getStoredScheduledExecutors(receiverId: Int): Seq[String] = { + if (receiverTrackingInfos.contains(receiverId)) { + val scheduledExecutors = receiverTrackingInfos(receiverId).scheduledExecutors + if (scheduledExecutors.nonEmpty) { + val executors = getExecutors.toSet + // Only return the alive executors + scheduledExecutors.get.filter(executors) + } else { + Nil + } + } else { + Nil + } + } + /** * Start a receiver along with its scheduled executors */ @@ -484,7 +528,23 @@ class ReceiverTracker(ssc: StreamingContext, skipReceiverLaunch: Boolean = false new SerializableConfiguration(ssc.sparkContext.hadoopConfiguration) // Function to start the receiver on the worker node - val startReceiverFunc = new StartReceiverFunc(checkpointDirOption, serializableHadoopConf) + val startReceiverFunc: Iterator[Receiver[_]] => Unit = + (iterator: Iterator[Receiver[_]]) => { + if (!iterator.hasNext) { + throw new SparkException( + "Could not start receiver as object not found.") + } + if (TaskContext.get().attemptNumber() == 0) { + val receiver = iterator.next() + assert(iterator.hasNext == false) + val supervisor = new ReceiverSupervisorImpl( + receiver, SparkEnv.get, serializableHadoopConf.value, checkpointDirOption) + supervisor.start() + supervisor.awaitTermination() + } else { + // It's restarted by TaskScheduler, but we want to reschedule it again. So exit it. + } + } // Create the RDD using the scheduledExecutors to run the receiver in a Spark job val receiverRDD: RDD[Receiver[_]] = @@ -541,31 +601,3 @@ class ReceiverTracker(ssc: StreamingContext, skipReceiverLaunch: Boolean = false } } - -/** - * Function to start the receiver on the worker node. Use a class instead of closure to avoid - * the serialization issue. 
- */ -private[streaming] class StartReceiverFunc( - checkpointDirOption: Option[String], - serializableHadoopConf: SerializableConfiguration) - extends (Iterator[Receiver[_]] => Unit) with Serializable { - - override def apply(iterator: Iterator[Receiver[_]]): Unit = { - if (!iterator.hasNext) { - throw new SparkException( - "Could not start receiver as object not found.") - } - if (TaskContext.get().attemptNumber() == 0) { - val receiver = iterator.next() - assert(iterator.hasNext == false) - val supervisor = new ReceiverSupervisorImpl( - receiver, SparkEnv.get, serializableHadoopConf.value, checkpointDirOption) - supervisor.start() - supervisor.awaitTermination() - } else { - // It's restarted by TaskScheduler, but we want to reschedule it again. So exit it. - } - } - -} diff --git a/streaming/src/test/scala/org/apache/spark/streaming/scheduler/ReceiverSchedulingPolicySuite.scala b/streaming/src/test/scala/org/apache/spark/streaming/scheduler/ReceiverSchedulingPolicySuite.scala index 0418d776ecc9a..b2a51d72bac2b 100644 --- a/streaming/src/test/scala/org/apache/spark/streaming/scheduler/ReceiverSchedulingPolicySuite.scala +++ b/streaming/src/test/scala/org/apache/spark/streaming/scheduler/ReceiverSchedulingPolicySuite.scala @@ -39,7 +39,7 @@ class ReceiverSchedulingPolicySuite extends SparkFunSuite { assert(scheduledExecutors.toSet === Set("host1", "host2")) } - test("rescheduleReceiver: return all idle executors if more than 3 idle executors") { + test("rescheduleReceiver: return all idle executors if there are any idle executors") { val executors = Seq("host1", "host2", "host3", "host4", "host5") // host3 is idle val receiverTrackingInfoMap = Map( @@ -49,16 +49,16 @@ class ReceiverSchedulingPolicySuite extends SparkFunSuite { assert(scheduledExecutors.toSet === Set("host2", "host3", "host4", "host5")) } - test("rescheduleReceiver: return 3 best options if less than 3 idle executors") { + test("rescheduleReceiver: return all executors that have minimum weight if no idle executors") { val executors = Seq("host1", "host2", "host3", "host4", "host5") - // Weights: host1 = 1.5, host2 = 0.5, host3 = 1.0 - // host4 and host5 are idle + // Weights: host1 = 1.5, host2 = 0.5, host3 = 1.0, host4 = 0.5, host5 = 0.5 val receiverTrackingInfoMap = Map( 0 -> ReceiverTrackingInfo(0, ReceiverState.ACTIVE, None, Some("host1")), 1 -> ReceiverTrackingInfo(1, ReceiverState.SCHEDULED, Some(Seq("host2", "host3")), None), - 2 -> ReceiverTrackingInfo(1, ReceiverState.SCHEDULED, Some(Seq("host1", "host3")), None)) + 2 -> ReceiverTrackingInfo(2, ReceiverState.SCHEDULED, Some(Seq("host1", "host3")), None), + 3 -> ReceiverTrackingInfo(4, ReceiverState.SCHEDULED, Some(Seq("host4", "host5")), None)) val scheduledExecutors = receiverSchedulingPolicy.rescheduleReceiver( - 3, None, receiverTrackingInfoMap, executors) + 4, None, receiverTrackingInfoMap, executors) assert(scheduledExecutors.toSet === Set("host2", "host4", "host5")) } @@ -127,4 +127,5 @@ class ReceiverSchedulingPolicySuite extends SparkFunSuite { assert(executors.isEmpty) } } + } From df7041d02d3fd44b08a859f5d77bf6fb726895f0 Mon Sep 17 00:00:00 2001 From: Yin Huai Date: Mon, 24 Aug 2015 23:38:32 -0700 Subject: [PATCH 060/802] [SPARK-10196] [SQL] Correctly saving decimals in internal rows to JSON. https://issues.apache.org/jira/browse/SPARK-10196 Author: Yin Huai Closes #8408 from yhuai/DecimalJsonSPARK-10196. 
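The root cause here is easy to miss: as the one-line change below suggests, decimal values read from Spark SQL's internal rows are carried by the internal Decimal wrapper rather than java.math.BigDecimal, so a match on the external type silently falls through. The sketch below reproduces that behavior with a stand-in wrapper; InternalDecimal is hypothetical and not Spark's actual Decimal class.
```
import java.math.BigDecimal

// Stand-in for Spark's internal decimal wrapper (not the real class).
final case class InternalDecimal(underlying: BigDecimal) {
  def toJavaBigDecimal: BigDecimal = underlying
}

object DecimalJsonSketch {
  // Before the fix: matching on java.math.BigDecimal never fires for
  // internal-row values, so the writer falls through to a failure case.
  def writeBefore(value: Any): String = value match {
    case v: BigDecimal => v.toPlainString
    case other         => s"<unsupported: ${other.getClass.getSimpleName}>"
  }

  // After the fix: match the internal wrapper and convert explicitly.
  def writeAfter(value: Any): String = value match {
    case v: InternalDecimal => v.toJavaBigDecimal.toPlainString
    case v: BigDecimal      => v.toPlainString
    case other              => s"<unsupported: ${other.getClass.getSimpleName}>"
  }

  def main(args: Array[String]): Unit = {
    val v = InternalDecimal(new BigDecimal("20000.99"))
    println(writeBefore(v)) // <unsupported: InternalDecimal>
    println(writeAfter(v))  // 20000.99
  }
}
```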
--- .../datasources/json/JacksonGenerator.scala | 2 +- .../sources/JsonHadoopFsRelationSuite.scala | 27 +++++++++++++++++++ 2 files changed, 28 insertions(+), 1 deletion(-) diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/json/JacksonGenerator.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/json/JacksonGenerator.scala index 99ac7730bd1c9..330ba907b2ef9 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/json/JacksonGenerator.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/json/JacksonGenerator.scala @@ -95,7 +95,7 @@ private[sql] object JacksonGenerator { case (FloatType, v: Float) => gen.writeNumber(v) case (DoubleType, v: Double) => gen.writeNumber(v) case (LongType, v: Long) => gen.writeNumber(v) - case (DecimalType(), v: java.math.BigDecimal) => gen.writeNumber(v) + case (DecimalType(), v: Decimal) => gen.writeNumber(v.toJavaBigDecimal) case (ByteType, v: Byte) => gen.writeNumber(v.toInt) case (BinaryType, v: Array[Byte]) => gen.writeBinary(v) case (BooleanType, v: Boolean) => gen.writeBoolean(v) diff --git a/sql/hive/src/test/scala/org/apache/spark/sql/sources/JsonHadoopFsRelationSuite.scala b/sql/hive/src/test/scala/org/apache/spark/sql/sources/JsonHadoopFsRelationSuite.scala index ed6d512ab36fe..8ca3a17085194 100644 --- a/sql/hive/src/test/scala/org/apache/spark/sql/sources/JsonHadoopFsRelationSuite.scala +++ b/sql/hive/src/test/scala/org/apache/spark/sql/sources/JsonHadoopFsRelationSuite.scala @@ -17,6 +17,8 @@ package org.apache.spark.sql.sources +import java.math.BigDecimal + import org.apache.hadoop.fs.Path import org.apache.spark.deploy.SparkHadoopUtil @@ -75,4 +77,29 @@ class JsonHadoopFsRelationSuite extends HadoopFsRelationTest { ) } } + + test("SPARK-10196: save decimal type to JSON") { + withTempDir { file => + file.delete() + + val schema = + new StructType() + .add("decimal", DecimalType(7, 2)) + + val data = + Row(new BigDecimal("10.02")) :: + Row(new BigDecimal("20000.99")) :: + Row(new BigDecimal("10000")) :: Nil + val df = createDataFrame(sparkContext.parallelize(data), schema) + + // Write the data out. + df.write.format(dataSourceName).save(file.getCanonicalPath) + + // Read it back and check the result. + checkAnswer( + read.format(dataSourceName).schema(schema).load(file.getCanonicalPath), + df + ) + } + } } From bf03fe68d62f33dda70dff45c3bda1f57b032dfc Mon Sep 17 00:00:00 2001 From: Cheng Lian Date: Tue, 25 Aug 2015 14:58:42 +0800 Subject: [PATCH 061/802] [SPARK-10136] [SQL] A more robust fix for SPARK-10136 PR #8341 is a valid fix for SPARK-10136, but it didn't catch the real root cause. The real problem can be rather tricky to explain, and requires audiences to be pretty familiar with parquet-format spec, especially details of `LIST` backwards-compatibility rules. Let me have a try to give an explanation here. The structure of the problematic Parquet schema generated by parquet-avro is something like this: ``` message m { group f (LIST) { // Level 1 repeated group array (LIST) { // Level 2 repeated array; // Level 3 } } } ``` (The schema generated by parquet-thrift is structurally similar, just replace the `array` at level 2 with `f_tuple`, and the other one at level 3 with `f_tuple_tuple`.) This structure consists of two nested legacy 2-level `LIST`-like structures: 1. 
The repeated group type at level 2 is the element type of the outer array defined at level 1 This group should map to an `CatalystArrayConverter.ElementConverter` when building converters. 2. The repeated primitive type at level 3 is the element type of the inner array defined at level 2 This group should also map to an `CatalystArrayConverter.ElementConverter`. The root cause of SPARK-10136 is that, the group at level 2 isn't properly recognized as the element type of level 1. Thus, according to parquet-format spec, the repeated primitive at level 3 is left as a so called "unannotated repeated primitive type", and is recognized as a required list of required primitive type, thus a `RepeatedPrimitiveConverter` instead of a `CatalystArrayConverter.ElementConverter` is created for it. According to parquet-format spec, unannotated repeated type shouldn't appear in a `LIST`- or `MAP`-annotated group. PR #8341 fixed this issue by allowing such unannotated repeated type appear in `LIST`-annotated groups, which is a non-standard, hacky, but valid fix. (I didn't realize this when authoring #8341 though.) As for the reason why level 2 isn't recognized as a list element type, it's because of the following `LIST` backwards-compatibility rule defined in the parquet-format spec: > If the repeated field is a group with one field and is named either `array` or uses the `LIST`-annotated group's name with `_tuple` appended then the repeated type is the element type and elements are required. (The `array` part is for parquet-avro compatibility, while the `_tuple` part is for parquet-thrift.) This rule is implemented in [`CatalystSchemaConverter.isElementType`] [1], but neglected in [`CatalystRowConverter.isElementType`] [2]. This PR delivers a more robust fix by adding this rule in the latter method. Note that parquet-avro 1.7.0 also suffers from this issue. Details can be found at [PARQUET-364] [3]. [1]: https://github.com/apache/spark/blob/85f9a61357994da5023b08b0a8a2eb09388ce7f8/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/parquet/CatalystSchemaConverter.scala#L259-L305 [2]: https://github.com/apache/spark/blob/85f9a61357994da5023b08b0a8a2eb09388ce7f8/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/parquet/CatalystRowConverter.scala#L456-L463 [3]: https://issues.apache.org/jira/browse/PARQUET-364 Author: Cheng Lian Closes #8361 from liancheng/spark-10136/proper-version. 
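To make the backwards-compatibility rule quoted above concrete, here is a small self-contained sketch using toy stand-ins for Parquet's type objects (these are not the real Type/GroupType API): a single-field repeated group counts as the element type itself when it is named `array` or `<parentName>_tuple`; otherwise it is treated as the standard wrapper and the element lives one level deeper. This mirrors the check the patch below adds to the row converter so that it agrees with the schema converter.
```
// Toy model of the LIST backwards-compatibility rule quoted above.
// (Simplified stand-ins, not parquet's actual Type/GroupType classes.)
sealed trait ToyType { def name: String }
case class ToyPrimitive(name: String) extends ToyType
case class ToyGroup(name: String, fields: ToyType*) extends ToyType

object ListCompatSketch {
  // Is the repeated field itself the element type, or just a wrapper
  // around the real element?
  def repeatedIsElementType(repeated: ToyType, parentName: String): Boolean =
    repeated match {
      case _: ToyPrimitive                              => true
      case g: ToyGroup if g.fields.size > 1             => true
      case g: ToyGroup if g.name == "array"             => true
      case g: ToyGroup if g.name == parentName + "_tuple" => true
      case _                                            => false
    }

  def main(args: Array[String]): Unit = {
    // parquet-avro style: repeated group named "array" is the element type.
    println(repeatedIsElementType(ToyGroup("array", ToyPrimitive("element")), "f"))   // true
    // parquet-thrift style: repeated group named "<parent>_tuple".
    println(repeatedIsElementType(ToyGroup("f_tuple", ToyPrimitive("element")), "f")) // true
    // Standard 3-level layout: a single-field group with any other name is a
    // wrapper, so the element is one level deeper.
    println(repeatedIsElementType(ToyGroup("list", ToyPrimitive("element")), "f"))    // false
  }
}
```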
--- .../parquet/CatalystRowConverter.scala | 18 ++++++++---------- 1 file changed, 8 insertions(+), 10 deletions(-) diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/parquet/CatalystRowConverter.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/parquet/CatalystRowConverter.scala index d2c2db51769ba..cbf0704c4a9a4 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/parquet/CatalystRowConverter.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/parquet/CatalystRowConverter.scala @@ -415,8 +415,9 @@ private[parquet] class CatalystRowConverter( private val elementConverter: Converter = { val repeatedType = parquetSchema.getType(0) val elementType = catalystSchema.elementType + val parentName = parquetSchema.getName - if (isElementType(repeatedType, elementType)) { + if (isElementType(repeatedType, elementType, parentName)) { newConverter(repeatedType, elementType, new ParentContainerUpdater { override def set(value: Any): Unit = currentArray += value }) @@ -453,10 +454,13 @@ private[parquet] class CatalystRowConverter( * @see https://github.com/apache/parquet-format/blob/master/LogicalTypes.md#backward-compatibility-rules */ // scalastyle:on - private def isElementType(parquetRepeatedType: Type, catalystElementType: DataType): Boolean = { + private def isElementType( + parquetRepeatedType: Type, catalystElementType: DataType, parentName: String): Boolean = { (parquetRepeatedType, catalystElementType) match { case (t: PrimitiveType, _) => true case (t: GroupType, _) if t.getFieldCount > 1 => true + case (t: GroupType, _) if t.getFieldCount == 1 && t.getName == "array" => true + case (t: GroupType, _) if t.getFieldCount == 1 && t.getName == parentName + "_tuple" => true case (t: GroupType, StructType(Array(f))) if f.name == t.getFieldName(0) => true case _ => false } @@ -474,15 +478,9 @@ private[parquet] class CatalystRowConverter( override def getConverter(fieldIndex: Int): Converter = converter - override def end(): Unit = { - converter.updater.end() - currentArray += currentElement - } + override def end(): Unit = currentArray += currentElement - override def start(): Unit = { - converter.updater.start() - currentElement = null - } + override def start(): Unit = currentElement = null } } From 82268f07abfa658869df2354ae72f8d6ddd119e8 Mon Sep 17 00:00:00 2001 From: Josh Rosen Date: Tue, 25 Aug 2015 00:04:10 -0700 Subject: [PATCH 062/802] [SPARK-9293] [SPARK-9813] Analysis should check that set operations are only performed on tables with equal numbers of columns This patch adds an analyzer rule to ensure that set operations (union, intersect, and except) are only applied to tables with the same number of columns. Without this rule, there are scenarios where invalid queries can return incorrect results instead of failing with error messages; SPARK-9813 provides one example of this problem. In other cases, the invalid query can crash at runtime with extremely confusing exceptions. I also performed a bit of cleanup to refactor some of those logical operators' code into a common `SetOperation` base class. Author: Josh Rosen Closes #7631 from JoshRosen/SPARK-9293. 
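The shape of the new rule is easy to see in isolation. The sketch below is a toy model rather than Spark code: `Plan` stands in for a resolved `LogicalPlan`, and the exception stands in for `failAnalysis`, but the message mirrors the one added to `CheckAnalysis` in the diff below.

```scala
// Toy model of the added check; Plan and checkSetOperation are illustrative stand-ins.
final case class Plan(name: String, output: Seq[String])

def checkSetOperation(nodeName: String, left: Plan, right: Plan): Unit =
  if (left.output.length != right.output.length) {
    throw new IllegalArgumentException(
      s"$nodeName can only be performed on tables with the same number of columns, " +
        s"but the left table has ${left.output.length} columns and the right has " +
        s"${right.output.length}")
  }

// Same arity on both sides: passes.
checkSetOperation("Union", Plan("t1", Seq("a", "b")), Plan("t2", Seq("c", "d")))

// Mismatched arity: rejected during analysis with a descriptive message, instead of
// silently producing wrong results or crashing at runtime.
try checkSetOperation("Intersect", Plan("t1", Seq("a", "b")), Plan("t3", Seq("c")))
catch { case e: IllegalArgumentException => println(e.getMessage) }
```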
--- .../sql/catalyst/analysis/CheckAnalysis.scala | 6 +++ .../catalyst/analysis/HiveTypeCoercion.scala | 14 +++---- .../plans/logical/basicOperators.scala | 38 +++++++++---------- .../analysis/AnalysisErrorSuite.scala | 18 +++++++++ .../spark/sql/hive/HiveMetastoreCatalog.scala | 2 +- .../hive/execution/InsertIntoHiveTable.scala | 2 +- 6 files changed, 48 insertions(+), 32 deletions(-) diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/CheckAnalysis.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/CheckAnalysis.scala index 39f554c137c98..7701fd0451041 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/CheckAnalysis.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/CheckAnalysis.scala @@ -137,6 +137,12 @@ trait CheckAnalysis { } } + case s @ SetOperation(left, right) if left.output.length != right.output.length => + failAnalysis( + s"${s.nodeName} can only be performed on tables with the same number of columns, " + + s"but the left table has ${left.output.length} columns and the right has " + + s"${right.output.length}") + case _ => // Fallbacks to the following checks } diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/HiveTypeCoercion.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/HiveTypeCoercion.scala index 2cb067f4aac91..a1aa2a2b2c680 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/HiveTypeCoercion.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/HiveTypeCoercion.scala @@ -203,6 +203,7 @@ object HiveTypeCoercion { planName: String, left: LogicalPlan, right: LogicalPlan): (LogicalPlan, LogicalPlan) = { + require(left.output.length == right.output.length) val castedTypes = left.output.zip(right.output).map { case (lhs, rhs) if lhs.dataType != rhs.dataType => @@ -229,15 +230,10 @@ object HiveTypeCoercion { def apply(plan: LogicalPlan): LogicalPlan = plan resolveOperators { case p if p.analyzed => p - case u @ Union(left, right) if u.childrenResolved && !u.resolved => - val (newLeft, newRight) = widenOutputTypes(u.nodeName, left, right) - Union(newLeft, newRight) - case e @ Except(left, right) if e.childrenResolved && !e.resolved => - val (newLeft, newRight) = widenOutputTypes(e.nodeName, left, right) - Except(newLeft, newRight) - case i @ Intersect(left, right) if i.childrenResolved && !i.resolved => - val (newLeft, newRight) = widenOutputTypes(i.nodeName, left, right) - Intersect(newLeft, newRight) + case s @ SetOperation(left, right) if s.childrenResolved + && left.output.length == right.output.length && !s.resolved => + val (newLeft, newRight) = widenOutputTypes(s.nodeName, left, right) + s.makeCopy(Array(newLeft, newRight)) } } diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/plans/logical/basicOperators.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/plans/logical/basicOperators.scala index 73b8261260acb..722f69cdca827 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/plans/logical/basicOperators.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/plans/logical/basicOperators.scala @@ -89,13 +89,21 @@ case class Filter(condition: Expression, child: LogicalPlan) extends UnaryNode { override def output: Seq[Attribute] = child.output } -case class Union(left: LogicalPlan, right: LogicalPlan) extends BinaryNode { +abstract class SetOperation(left: LogicalPlan, right: LogicalPlan) 
extends BinaryNode { // TODO: These aren't really the same attributes as nullability etc might change. - override def output: Seq[Attribute] = left.output + final override def output: Seq[Attribute] = left.output - override lazy val resolved: Boolean = + final override lazy val resolved: Boolean = childrenResolved && - left.output.zip(right.output).forall { case (l, r) => l.dataType == r.dataType } + left.output.length == right.output.length && + left.output.zip(right.output).forall { case (l, r) => l.dataType == r.dataType } +} + +private[sql] object SetOperation { + def unapply(p: SetOperation): Option[(LogicalPlan, LogicalPlan)] = Some((p.left, p.right)) +} + +case class Union(left: LogicalPlan, right: LogicalPlan) extends SetOperation(left, right) { override def statistics: Statistics = { val sizeInBytes = left.statistics.sizeInBytes + right.statistics.sizeInBytes @@ -103,6 +111,10 @@ case class Union(left: LogicalPlan, right: LogicalPlan) extends BinaryNode { } } +case class Intersect(left: LogicalPlan, right: LogicalPlan) extends SetOperation(left, right) + +case class Except(left: LogicalPlan, right: LogicalPlan) extends SetOperation(left, right) + case class Join( left: LogicalPlan, right: LogicalPlan, @@ -142,15 +154,6 @@ case class BroadcastHint(child: LogicalPlan) extends UnaryNode { override def output: Seq[Attribute] = child.output } - -case class Except(left: LogicalPlan, right: LogicalPlan) extends BinaryNode { - override def output: Seq[Attribute] = left.output - - override lazy val resolved: Boolean = - childrenResolved && - left.output.zip(right.output).forall { case (l, r) => l.dataType == r.dataType } -} - case class InsertIntoTable( table: LogicalPlan, partition: Map[String, Option[String]], @@ -160,7 +163,7 @@ case class InsertIntoTable( extends LogicalPlan { override def children: Seq[LogicalPlan] = child :: Nil - override def output: Seq[Attribute] = child.output + override def output: Seq[Attribute] = Seq.empty assert(overwrite || !ifNotExists) override lazy val resolved: Boolean = childrenResolved && child.output.zip(table.output).forall { @@ -440,10 +443,3 @@ case object OneRowRelation extends LeafNode { override def statistics: Statistics = Statistics(sizeInBytes = 1) } -case class Intersect(left: LogicalPlan, right: LogicalPlan) extends BinaryNode { - override def output: Seq[Attribute] = left.output - - override lazy val resolved: Boolean = - childrenResolved && - left.output.zip(right.output).forall { case (l, r) => l.dataType == r.dataType } -} diff --git a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/analysis/AnalysisErrorSuite.scala b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/analysis/AnalysisErrorSuite.scala index 7065adce04bf8..fbdd3a7776f50 100644 --- a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/analysis/AnalysisErrorSuite.scala +++ b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/analysis/AnalysisErrorSuite.scala @@ -145,6 +145,24 @@ class AnalysisErrorSuite extends AnalysisTest { UnresolvedTestPlan(), "unresolved" :: Nil) + errorTest( + "union with unequal number of columns", + testRelation.unionAll(testRelation2), + "union" :: "number of columns" :: testRelation2.output.length.toString :: + testRelation.output.length.toString :: Nil) + + errorTest( + "intersect with unequal number of columns", + testRelation.intersect(testRelation2), + "intersect" :: "number of columns" :: testRelation2.output.length.toString :: + testRelation.output.length.toString :: Nil) + + errorTest( + "except with unequal 
number of columns", + testRelation.except(testRelation2), + "except" :: "number of columns" :: testRelation2.output.length.toString :: + testRelation.output.length.toString :: Nil) + errorTest( "SPARK-9955: correct error message for aggregate", // When parse SQL string, we will wrap aggregate expressions with UnresolvedAlias. diff --git a/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveMetastoreCatalog.scala b/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveMetastoreCatalog.scala index bbe8c1911bf86..98d21aa76d64e 100644 --- a/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveMetastoreCatalog.scala +++ b/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveMetastoreCatalog.scala @@ -751,7 +751,7 @@ private[hive] case class InsertIntoHiveTable( extends LogicalPlan { override def children: Seq[LogicalPlan] = child :: Nil - override def output: Seq[Attribute] = child.output + override def output: Seq[Attribute] = Seq.empty val numDynamicPartitions = partition.values.count(_.isEmpty) diff --git a/sql/hive/src/main/scala/org/apache/spark/sql/hive/execution/InsertIntoHiveTable.scala b/sql/hive/src/main/scala/org/apache/spark/sql/hive/execution/InsertIntoHiveTable.scala index 12c667e6e92da..62efda613a176 100644 --- a/sql/hive/src/main/scala/org/apache/spark/sql/hive/execution/InsertIntoHiveTable.scala +++ b/sql/hive/src/main/scala/org/apache/spark/sql/hive/execution/InsertIntoHiveTable.scala @@ -61,7 +61,7 @@ case class InsertIntoHiveTable( serializer } - def output: Seq[Attribute] = child.output + def output: Seq[Attribute] = Seq.empty def saveAsHiveFile( rdd: RDD[InternalRow], From d4549fe58fa0d781e0e891bceff893420cb1d598 Mon Sep 17 00:00:00 2001 From: Yu ISHIKAWA Date: Tue, 25 Aug 2015 00:28:51 -0700 Subject: [PATCH 063/802] [SPARK-10214] [SPARKR] [DOCS] Improve SparkR Column, DataFrame API docs cc: shivaram ## Summary - Add name tags to each methods in DataFrame.R and column.R - Replace `rdname column` with `rdname {each_func}`. i.e. alias method : `rdname column` => `rdname alias` ## Generated PDF File https://drive.google.com/file/d/0B9biIZIU47lLNHN2aFpnQXlSeGs/view?usp=sharing ## JIRA [[SPARK-10214] Improve SparkR Column, DataFrame API docs - ASF JIRA](https://issues.apache.org/jira/browse/SPARK-10214) Author: Yu ISHIKAWA Closes #8414 from yu-iskw/SPARK-10214. --- R/pkg/R/DataFrame.R | 101 +++++++++++++++++++++++++++++++++++--------- R/pkg/R/column.R | 40 ++++++++++++------ R/pkg/R/generics.R | 2 +- 3 files changed, 109 insertions(+), 34 deletions(-) diff --git a/R/pkg/R/DataFrame.R b/R/pkg/R/DataFrame.R index 895603235011e..10f3c4ea59864 100644 --- a/R/pkg/R/DataFrame.R +++ b/R/pkg/R/DataFrame.R @@ -27,9 +27,10 @@ setOldClass("jobj") #' \code{jsonFile}, \code{table} etc. 
#' @rdname DataFrame #' @seealso jsonFile, table +#' @docType class #' -#' @param env An R environment that stores bookkeeping states of the DataFrame -#' @param sdf A Java object reference to the backing Scala DataFrame +#' @slot env An R environment that stores bookkeeping states of the DataFrame +#' @slot sdf A Java object reference to the backing Scala DataFrame #' @export setClass("DataFrame", slots = list(env = "environment", @@ -61,6 +62,7 @@ dataFrame <- function(sdf, isCached = FALSE) { #' @param x A SparkSQL DataFrame #' #' @rdname printSchema +#' @name printSchema #' @export #' @examples #'\dontrun{ @@ -84,6 +86,7 @@ setMethod("printSchema", #' @param x A SparkSQL DataFrame #' #' @rdname schema +#' @name schema #' @export #' @examples #'\dontrun{ @@ -106,6 +109,7 @@ setMethod("schema", #' @param x A SparkSQL DataFrame #' @param extended Logical. If extended is False, explain() only prints the physical plan. #' @rdname explain +#' @name explain #' @export #' @examples #'\dontrun{ @@ -135,6 +139,7 @@ setMethod("explain", #' @param x A SparkSQL DataFrame #' #' @rdname isLocal +#' @name isLocal #' @export #' @examples #'\dontrun{ @@ -158,6 +163,7 @@ setMethod("isLocal", #' @param numRows The number of rows to print. Defaults to 20. #' #' @rdname showDF +#' @name showDF #' @export #' @examples #'\dontrun{ @@ -181,6 +187,7 @@ setMethod("showDF", #' @param x A SparkSQL DataFrame #' #' @rdname show +#' @name show #' @export #' @examples #'\dontrun{ @@ -206,6 +213,7 @@ setMethod("show", "DataFrame", #' @param x A SparkSQL DataFrame #' #' @rdname dtypes +#' @name dtypes #' @export #' @examples #'\dontrun{ @@ -230,6 +238,8 @@ setMethod("dtypes", #' @param x A SparkSQL DataFrame #' #' @rdname columns +#' @name columns +#' @aliases names #' @export #' @examples #'\dontrun{ @@ -248,7 +258,7 @@ setMethod("columns", }) #' @rdname columns -#' @aliases names,DataFrame,function-method +#' @name names setMethod("names", signature(x = "DataFrame"), function(x) { @@ -256,6 +266,7 @@ setMethod("names", }) #' @rdname columns +#' @name names<- setMethod("names<-", signature(x = "DataFrame"), function(x, value) { @@ -273,6 +284,7 @@ setMethod("names<-", #' @param tableName A character vector containing the name of the table #' #' @rdname registerTempTable +#' @name registerTempTable #' @export #' @examples #'\dontrun{ @@ -299,6 +311,7 @@ setMethod("registerTempTable", #' the existing rows in the table. #' #' @rdname insertInto +#' @name insertInto #' @export #' @examples #'\dontrun{ @@ -321,7 +334,8 @@ setMethod("insertInto", #' #' @param x A SparkSQL DataFrame #' -#' @rdname cache-methods +#' @rdname cache +#' @name cache #' @export #' @examples #'\dontrun{ @@ -347,6 +361,7 @@ setMethod("cache", #' #' @param x The DataFrame to persist #' @rdname persist +#' @name persist #' @export #' @examples #'\dontrun{ @@ -372,6 +387,7 @@ setMethod("persist", #' @param x The DataFrame to unpersist #' @param blocking Whether to block until all blocks are deleted #' @rdname unpersist-methods +#' @name unpersist #' @export #' @examples #'\dontrun{ @@ -397,6 +413,7 @@ setMethod("unpersist", #' @param x A SparkSQL DataFrame #' @param numPartitions The number of partitions to use. 
#' @rdname repartition +#' @name repartition #' @export #' @examples #'\dontrun{ @@ -446,6 +463,7 @@ setMethod("toJSON", #' @param x A SparkSQL DataFrame #' @param path The directory where the file is saved #' @rdname saveAsParquetFile +#' @name saveAsParquetFile #' @export #' @examples #'\dontrun{ @@ -467,6 +485,7 @@ setMethod("saveAsParquetFile", #' #' @param x A SparkSQL DataFrame #' @rdname distinct +#' @name distinct #' @export #' @examples #'\dontrun{ @@ -488,7 +507,8 @@ setMethod("distinct", #' @description Returns a new DataFrame containing distinct rows in this DataFrame #' #' @rdname unique -#' @aliases unique +#' @name unique +#' @aliases distinct setMethod("unique", signature(x = "DataFrame"), function(x) { @@ -526,7 +546,7 @@ setMethod("sample", }) #' @rdname sample -#' @aliases sample +#' @name sample_frac setMethod("sample_frac", signature(x = "DataFrame", withReplacement = "logical", fraction = "numeric"), @@ -541,6 +561,8 @@ setMethod("sample_frac", #' @param x A SparkSQL DataFrame #' #' @rdname count +#' @name count +#' @aliases nrow #' @export #' @examples #'\dontrun{ @@ -574,6 +596,7 @@ setMethod("nrow", #' @param x a SparkSQL DataFrame #' #' @rdname ncol +#' @name ncol #' @export #' @examples #'\dontrun{ @@ -593,6 +616,7 @@ setMethod("ncol", #' @param x a SparkSQL DataFrame #' #' @rdname dim +#' @name dim #' @export #' @examples #'\dontrun{ @@ -613,8 +637,8 @@ setMethod("dim", #' @param x A SparkSQL DataFrame #' @param stringsAsFactors (Optional) A logical indicating whether or not string columns #' should be converted to factors. FALSE by default. - -#' @rdname collect-methods +#' @rdname collect +#' @name collect #' @export #' @examples #'\dontrun{ @@ -650,6 +674,7 @@ setMethod("collect", #' @return A new DataFrame containing the number of rows specified. #' #' @rdname limit +#' @name limit #' @export #' @examples #' \dontrun{ @@ -669,6 +694,7 @@ setMethod("limit", #' Take the first NUM rows of a DataFrame and return a the results as a data.frame #' #' @rdname take +#' @name take #' @export #' @examples #'\dontrun{ @@ -696,6 +722,7 @@ setMethod("take", #' @return A data.frame #' #' @rdname head +#' @name head #' @export #' @examples #'\dontrun{ @@ -717,6 +744,7 @@ setMethod("head", #' @param x A SparkSQL DataFrame #' #' @rdname first +#' @name first #' @export #' @examples #'\dontrun{ @@ -732,7 +760,7 @@ setMethod("first", take(x, 1) }) -# toRDD() +# toRDD # # Converts a Spark DataFrame to an RDD while preserving column names. # @@ -769,6 +797,7 @@ setMethod("toRDD", #' @seealso GroupedData #' @aliases group_by #' @rdname groupBy +#' @name groupBy #' @export #' @examples #' \dontrun{ @@ -792,7 +821,7 @@ setMethod("groupBy", }) #' @rdname groupBy -#' @aliases group_by +#' @name group_by setMethod("group_by", signature(x = "DataFrame"), function(x, ...) { @@ -804,7 +833,8 @@ setMethod("group_by", #' Compute aggregates by specifying a list of columns #' #' @param x a DataFrame -#' @rdname DataFrame +#' @rdname agg +#' @name agg #' @aliases summarize #' @export setMethod("agg", @@ -813,8 +843,8 @@ setMethod("agg", agg(groupBy(x), ...) }) -#' @rdname DataFrame -#' @aliases agg +#' @rdname agg +#' @name summarize setMethod("summarize", signature(x = "DataFrame"), function(x, ...) 
{ @@ -890,12 +920,14 @@ getColumn <- function(x, c) { } #' @rdname select +#' @name $ setMethod("$", signature(x = "DataFrame"), function(x, name) { getColumn(x, name) }) #' @rdname select +#' @name $<- setMethod("$<-", signature(x = "DataFrame"), function(x, name, value) { stopifnot(class(value) == "Column" || is.null(value)) @@ -923,6 +955,7 @@ setMethod("$<-", signature(x = "DataFrame"), }) #' @rdname select +#' @name [[ setMethod("[[", signature(x = "DataFrame"), function(x, i) { if (is.numeric(i)) { @@ -933,6 +966,7 @@ setMethod("[[", signature(x = "DataFrame"), }) #' @rdname select +#' @name [ setMethod("[", signature(x = "DataFrame", i = "missing"), function(x, i, j, ...) { if (is.numeric(j)) { @@ -1008,6 +1042,7 @@ setMethod("select", #' @param ... Additional expressions #' @return A DataFrame #' @rdname selectExpr +#' @name selectExpr #' @export #' @examples #'\dontrun{ @@ -1034,6 +1069,8 @@ setMethod("selectExpr", #' @param col A Column expression. #' @return A DataFrame with the new column added. #' @rdname withColumn +#' @name withColumn +#' @aliases mutate #' @export #' @examples #'\dontrun{ @@ -1057,7 +1094,7 @@ setMethod("withColumn", #' @param col a named argument of the form name = col #' @return A new DataFrame with the new columns added. #' @rdname withColumn -#' @aliases withColumn +#' @name mutate #' @export #' @examples #'\dontrun{ @@ -1094,6 +1131,7 @@ setMethod("mutate", #' @param newCol The new column name. #' @return A DataFrame with the column name changed. #' @rdname withColumnRenamed +#' @name withColumnRenamed #' @export #' @examples #'\dontrun{ @@ -1124,6 +1162,7 @@ setMethod("withColumnRenamed", #' @param newCol A named pair of the form new_column_name = existing_column #' @return A DataFrame with the column name changed. #' @rdname withColumnRenamed +#' @name rename #' @aliases withColumnRenamed #' @export #' @examples @@ -1165,6 +1204,8 @@ setClassUnion("characterOrColumn", c("character", "Column")) #' @param ... Additional sorting fields #' @return A DataFrame where all elements are sorted. #' @rdname arrange +#' @name arrange +#' @aliases orderby #' @export #' @examples #'\dontrun{ @@ -1191,7 +1232,7 @@ setMethod("arrange", }) #' @rdname arrange -#' @aliases orderBy,DataFrame,function-method +#' @name orderby setMethod("orderBy", signature(x = "DataFrame", col = "characterOrColumn"), function(x, col) { @@ -1207,6 +1248,7 @@ setMethod("orderBy", #' or a string containing a SQL statement #' @return A DataFrame containing only the rows that meet the condition. #' @rdname filter +#' @name filter #' @export #' @examples #'\dontrun{ @@ -1228,7 +1270,7 @@ setMethod("filter", }) #' @rdname filter -#' @aliases where,DataFrame,function-method +#' @name where setMethod("where", signature(x = "DataFrame", condition = "characterOrColumn"), function(x, condition) { @@ -1247,6 +1289,7 @@ setMethod("where", #' 'inner', 'outer', 'left_outer', 'right_outer', 'semijoin'. The default joinType is "inner". #' @return A DataFrame containing the result of the join operation. #' @rdname join +#' @name join #' @export #' @examples #'\dontrun{ @@ -1279,8 +1322,9 @@ setMethod("join", dataFrame(sdf) }) -#' rdname merge -#' aliases join +#' @rdname merge +#' @name merge +#' @aliases join setMethod("merge", signature(x = "DataFrame", y = "DataFrame"), function(x, y, joinExpr = NULL, joinType = NULL, ...) { @@ -1298,6 +1342,7 @@ setMethod("merge", #' @param y A Spark DataFrame #' @return A DataFrame containing the result of the union. 
#' @rdname unionAll +#' @name unionAll #' @export #' @examples #'\dontrun{ @@ -1319,6 +1364,7 @@ setMethod("unionAll", #' @description Returns a new DataFrame containing rows of all parameters. # #' @rdname rbind +#' @name rbind #' @aliases unionAll setMethod("rbind", signature(... = "DataFrame"), @@ -1339,6 +1385,7 @@ setMethod("rbind", #' @param y A Spark DataFrame #' @return A DataFrame containing the result of the intersect. #' @rdname intersect +#' @name intersect #' @export #' @examples #'\dontrun{ @@ -1364,6 +1411,7 @@ setMethod("intersect", #' @param y A Spark DataFrame #' @return A DataFrame containing the result of the except operation. #' @rdname except +#' @name except #' @export #' @examples #'\dontrun{ @@ -1403,6 +1451,8 @@ setMethod("except", #' @param mode One of 'append', 'overwrite', 'error', 'ignore' #' #' @rdname write.df +#' @name write.df +#' @aliases saveDF #' @export #' @examples #'\dontrun{ @@ -1435,7 +1485,7 @@ setMethod("write.df", }) #' @rdname write.df -#' @aliases saveDF +#' @name saveDF #' @export setMethod("saveDF", signature(df = "DataFrame", path = "character"), @@ -1466,6 +1516,7 @@ setMethod("saveDF", #' @param mode One of 'append', 'overwrite', 'error', 'ignore' #' #' @rdname saveAsTable +#' @name saveAsTable #' @export #' @examples #'\dontrun{ @@ -1505,6 +1556,8 @@ setMethod("saveAsTable", #' @param ... Additional expressions #' @return A DataFrame #' @rdname describe +#' @name describe +#' @aliases summary #' @export #' @examples #'\dontrun{ @@ -1525,6 +1578,7 @@ setMethod("describe", }) #' @rdname describe +#' @name describe setMethod("describe", signature(x = "DataFrame"), function(x) { @@ -1538,7 +1592,7 @@ setMethod("describe", #' @description Computes statistics for numeric columns of the DataFrame #' #' @rdname summary -#' @aliases describe +#' @name summary setMethod("summary", signature(x = "DataFrame"), function(x) { @@ -1562,6 +1616,8 @@ setMethod("summary", #' @return A DataFrame #' #' @rdname nafunctions +#' @name dropna +#' @aliases na.omit #' @export #' @examples #'\dontrun{ @@ -1588,7 +1644,8 @@ setMethod("dropna", dataFrame(sdf) }) -#' @aliases dropna +#' @rdname nafunctions +#' @name na.omit #' @export setMethod("na.omit", signature(x = "DataFrame"), @@ -1615,6 +1672,7 @@ setMethod("na.omit", #' @return A DataFrame #' #' @rdname nafunctions +#' @name fillna #' @export #' @examples #'\dontrun{ @@ -1685,6 +1743,7 @@ setMethod("fillna", #' occurrences will have zero as their counts. 
#' #' @rdname statfunctions +#' @name crosstab #' @export #' @examples #' \dontrun{ diff --git a/R/pkg/R/column.R b/R/pkg/R/column.R index a1f50c383367c..4805096f3f9c5 100644 --- a/R/pkg/R/column.R +++ b/R/pkg/R/column.R @@ -24,10 +24,9 @@ setOldClass("jobj") #' @title S4 class that represents a DataFrame column #' @description The column class supports unary, binary operations on DataFrame columns - #' @rdname column #' -#' @param jc reference to JVM DataFrame column +#' @slot jc reference to JVM DataFrame column #' @export setClass("Column", slots = list(jc = "jobj")) @@ -46,6 +45,7 @@ col <- function(x) { } #' @rdname show +#' @name show setMethod("show", "Column", function(object) { cat("Column", callJMethod(object@jc, "toString"), "\n") @@ -122,8 +122,11 @@ createMethods() #' alias #' #' Set a new name for a column - -#' @rdname column +#' +#' @rdname alias +#' @name alias +#' @family colum_func +#' @export setMethod("alias", signature(object = "Column"), function(object, data) { @@ -138,7 +141,9 @@ setMethod("alias", #' #' An expression that returns a substring. #' -#' @rdname column +#' @rdname substr +#' @name substr +#' @family colum_func #' #' @param start starting position #' @param stop ending position @@ -152,7 +157,9 @@ setMethod("substr", signature(x = "Column"), #' #' Test if the column is between the lower bound and upper bound, inclusive. #' -#' @rdname column +#' @rdname between +#' @name between +#' @family colum_func #' #' @param bounds lower and upper bounds setMethod("between", signature(x = "Column"), @@ -167,7 +174,9 @@ setMethod("between", signature(x = "Column"), #' Casts the column to a different data type. #' -#' @rdname column +#' @rdname cast +#' @name cast +#' @family colum_func #' #' @examples \dontrun{ #' cast(df$age, "string") @@ -189,11 +198,15 @@ setMethod("cast", #' Match a column with given values. #' -#' @rdname column +#' @rdname match +#' @name %in% +#' @aliases %in% #' @return a matched values as a result of comparing with given values. -#' @examples \dontrun{ -#' filter(df, "age in (10, 30)") -#' where(df, df$age %in% c(10, 30)) +#' @export +#' @examples +#' \dontrun{ +#' filter(df, "age in (10, 30)") +#' where(df, df$age %in% c(10, 30)) #' } setMethod("%in%", signature(x = "Column"), @@ -208,7 +221,10 @@ setMethod("%in%", #' If values in the specified column are null, returns the value. #' Can be used in conjunction with `when` to specify a default value for expressions. #' -#' @rdname column +#' @rdname otherwise +#' @name otherwise +#' @family colum_func +#' @export setMethod("otherwise", signature(x = "Column", value = "ANY"), function(x, value) { diff --git a/R/pkg/R/generics.R b/R/pkg/R/generics.R index 610a8c31223cd..a829d46c1894c 100644 --- a/R/pkg/R/generics.R +++ b/R/pkg/R/generics.R @@ -441,7 +441,7 @@ setGeneric("filter", function(x, condition) { standardGeneric("filter") }) #' @export setGeneric("group_by", function(x, ...) { standardGeneric("group_by") }) -#' @rdname DataFrame +#' @rdname groupBy #' @export setGeneric("groupBy", function(x, ...) { standardGeneric("groupBy") }) From 57b960bf3706728513f9e089455a533f0244312e Mon Sep 17 00:00:00 2001 From: Sean Owen Date: Tue, 25 Aug 2015 08:32:20 +0100 Subject: [PATCH 064/802] [SPARK-6196] [BUILD] Remove MapR profiles in favor of hadoop-provided Follow up to https://github.com/apache/spark/pull/7047 pwendell mentioned that MapR should use `hadoop-provided` now, and indeed the new build script does not produce `mapr3`/`mapr4` artifacts anymore. 
Hence the action seems to be to remove the profiles, which are now not used. CC trystanleftwich Author: Sean Owen Closes #8338 from srowen/SPARK-6196. --- pom.xml | 38 -------------------------------------- 1 file changed, 38 deletions(-) diff --git a/pom.xml b/pom.xml index d5945f2546d38..0716016523ee1 100644 --- a/pom.xml +++ b/pom.xml @@ -2386,44 +2386,6 @@ - - mapr3 - - 1.0.3-mapr-3.0.3 - 2.4.1-mapr-1408 - 0.98.4-mapr-1408 - 3.4.5-mapr-1406 - - - - - mapr4 - - 2.4.1-mapr-1408 - 2.4.1-mapr-1408 - 0.98.4-mapr-1408 - 3.4.5-mapr-1406 - - - - org.apache.curator - curator-recipes - ${curator.version} - - - org.apache.zookeeper - zookeeper - - - - - org.apache.zookeeper - zookeeper - 3.4.5-mapr-1406 - - - - hive-thriftserver From 1fc37581a52530bac5d555dbf14927a5780c3b75 Mon Sep 17 00:00:00 2001 From: Tathagata Das Date: Tue, 25 Aug 2015 00:35:51 -0700 Subject: [PATCH 065/802] [SPARK-10210] [STREAMING] Filter out non-existent blocks before creating BlockRDD When write ahead log is not enabled, a recovered streaming driver still tries to run jobs using pre-failure block ids, and fails as the block do not exists in-memory any more (and cannot be recovered as receiver WAL is not enabled). This occurs because the driver-side WAL of ReceivedBlockTracker is recovers that past block information, and ReceiveInputDStream creates BlockRDDs even if those blocks do not exist. The solution in this PR is to filter out block ids that do not exist before creating the BlockRDD. In addition, it adds unit tests to verify other logic in ReceiverInputDStream. Author: Tathagata Das Closes #8405 from tdas/SPARK-10210. --- .../dstream/ReceiverInputDStream.scala | 10 +- .../rdd/WriteAheadLogBackedBlockRDD.scala | 2 +- .../streaming/ReceiverInputDStreamSuite.scala | 156 ++++++++++++++++++ 3 files changed, 166 insertions(+), 2 deletions(-) create mode 100644 streaming/src/test/scala/org/apache/spark/streaming/ReceiverInputDStreamSuite.scala diff --git a/streaming/src/main/scala/org/apache/spark/streaming/dstream/ReceiverInputDStream.scala b/streaming/src/main/scala/org/apache/spark/streaming/dstream/ReceiverInputDStream.scala index a15800917c6f4..6c139f32da31d 100644 --- a/streaming/src/main/scala/org/apache/spark/streaming/dstream/ReceiverInputDStream.scala +++ b/streaming/src/main/scala/org/apache/spark/streaming/dstream/ReceiverInputDStream.scala @@ -116,7 +116,15 @@ abstract class ReceiverInputDStream[T: ClassTag](@transient ssc_ : StreamingCont logWarning("Some blocks have Write Ahead Log information; this is unexpected") } } - new BlockRDD[T](ssc.sc, blockIds) + val validBlockIds = blockIds.filter { id => + ssc.sparkContext.env.blockManager.master.contains(id) + } + if (validBlockIds.size != blockIds.size) { + logWarning("Some blocks could not be recovered as they were not found in memory. 
" + + "To prevent such data loss, enabled Write Ahead Log (see programming guide " + + "for more details.") + } + new BlockRDD[T](ssc.sc, validBlockIds) } } else { // If no block is ready now, creating WriteAheadLogBackedBlockRDD or BlockRDD diff --git a/streaming/src/main/scala/org/apache/spark/streaming/rdd/WriteAheadLogBackedBlockRDD.scala b/streaming/src/main/scala/org/apache/spark/streaming/rdd/WriteAheadLogBackedBlockRDD.scala index 620b8a36a2baf..e081ffe46f502 100644 --- a/streaming/src/main/scala/org/apache/spark/streaming/rdd/WriteAheadLogBackedBlockRDD.scala +++ b/streaming/src/main/scala/org/apache/spark/streaming/rdd/WriteAheadLogBackedBlockRDD.scala @@ -75,7 +75,7 @@ private[streaming] class WriteAheadLogBackedBlockRDD[T: ClassTag]( @transient sc: SparkContext, @transient blockIds: Array[BlockId], - @transient walRecordHandles: Array[WriteAheadLogRecordHandle], + @transient val walRecordHandles: Array[WriteAheadLogRecordHandle], @transient isBlockIdValid: Array[Boolean] = Array.empty, storeInBlockManager: Boolean = false, storageLevel: StorageLevel = StorageLevel.MEMORY_ONLY_SER) diff --git a/streaming/src/test/scala/org/apache/spark/streaming/ReceiverInputDStreamSuite.scala b/streaming/src/test/scala/org/apache/spark/streaming/ReceiverInputDStreamSuite.scala new file mode 100644 index 0000000000000..6d388d9624d92 --- /dev/null +++ b/streaming/src/test/scala/org/apache/spark/streaming/ReceiverInputDStreamSuite.scala @@ -0,0 +1,156 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.spark.streaming + +import scala.util.Random + +import org.scalatest.BeforeAndAfterAll + +import org.apache.spark.rdd.BlockRDD +import org.apache.spark.storage.{StorageLevel, StreamBlockId} +import org.apache.spark.streaming.dstream.ReceiverInputDStream +import org.apache.spark.streaming.rdd.WriteAheadLogBackedBlockRDD +import org.apache.spark.streaming.receiver.{BlockManagerBasedStoreResult, Receiver, WriteAheadLogBasedStoreResult} +import org.apache.spark.streaming.scheduler.ReceivedBlockInfo +import org.apache.spark.streaming.util.{WriteAheadLogRecordHandle, WriteAheadLogUtils} +import org.apache.spark.{SparkConf, SparkEnv} + +class ReceiverInputDStreamSuite extends TestSuiteBase with BeforeAndAfterAll { + + override def afterAll(): Unit = { + StreamingContext.getActive().map { _.stop() } + } + + testWithoutWAL("createBlockRDD creates empty BlockRDD when no block info") { receiverStream => + val rdd = receiverStream.createBlockRDD(Time(0), Seq.empty) + assert(rdd.isInstanceOf[BlockRDD[_]]) + assert(!rdd.isInstanceOf[WriteAheadLogBackedBlockRDD[_]]) + assert(rdd.isEmpty()) + } + + testWithoutWAL("createBlockRDD creates correct BlockRDD with block info") { receiverStream => + val blockInfos = Seq.fill(5) { createBlockInfo(withWALInfo = false) } + val blockIds = blockInfos.map(_.blockId) + + // Verify that there are some blocks that are present, and some that are not + require(blockIds.forall(blockId => SparkEnv.get.blockManager.master.contains(blockId))) + + val rdd = receiverStream.createBlockRDD(Time(0), blockInfos) + assert(rdd.isInstanceOf[BlockRDD[_]]) + assert(!rdd.isInstanceOf[WriteAheadLogBackedBlockRDD[_]]) + val blockRDD = rdd.asInstanceOf[BlockRDD[_]] + assert(blockRDD.blockIds.toSeq === blockIds) + } + + testWithoutWAL("createBlockRDD filters non-existent blocks before creating BlockRDD") { + receiverStream => + val presentBlockInfos = Seq.fill(2)(createBlockInfo(withWALInfo = false, createBlock = true)) + val absentBlockInfos = Seq.fill(3)(createBlockInfo(withWALInfo = false, createBlock = false)) + val blockInfos = presentBlockInfos ++ absentBlockInfos + val blockIds = blockInfos.map(_.blockId) + + // Verify that there are some blocks that are present, and some that are not + require(blockIds.exists(blockId => SparkEnv.get.blockManager.master.contains(blockId))) + require(blockIds.exists(blockId => !SparkEnv.get.blockManager.master.contains(blockId))) + + val rdd = receiverStream.createBlockRDD(Time(0), blockInfos) + assert(rdd.isInstanceOf[BlockRDD[_]]) + val blockRDD = rdd.asInstanceOf[BlockRDD[_]] + assert(blockRDD.blockIds.toSeq === presentBlockInfos.map { _.blockId}) + } + + testWithWAL("createBlockRDD creates empty WALBackedBlockRDD when no block info") { + receiverStream => + val rdd = receiverStream.createBlockRDD(Time(0), Seq.empty) + assert(rdd.isInstanceOf[WriteAheadLogBackedBlockRDD[_]]) + assert(rdd.isEmpty()) + } + + testWithWAL( + "createBlockRDD creates correct WALBackedBlockRDD with all block info having WAL info") { + receiverStream => + val blockInfos = Seq.fill(5) { createBlockInfo(withWALInfo = true) } + val blockIds = blockInfos.map(_.blockId) + val rdd = receiverStream.createBlockRDD(Time(0), blockInfos) + assert(rdd.isInstanceOf[WriteAheadLogBackedBlockRDD[_]]) + val blockRDD = rdd.asInstanceOf[WriteAheadLogBackedBlockRDD[_]] + assert(blockRDD.blockIds.toSeq === blockIds) + assert(blockRDD.walRecordHandles.toSeq === blockInfos.map { _.walRecordHandleOption.get }) + } + + testWithWAL("createBlockRDD creates BlockRDD when some 
block info dont have WAL info") { + receiverStream => + val blockInfos1 = Seq.fill(2) { createBlockInfo(withWALInfo = true) } + val blockInfos2 = Seq.fill(3) { createBlockInfo(withWALInfo = false) } + val blockInfos = blockInfos1 ++ blockInfos2 + val blockIds = blockInfos.map(_.blockId) + val rdd = receiverStream.createBlockRDD(Time(0), blockInfos) + assert(rdd.isInstanceOf[BlockRDD[_]]) + val blockRDD = rdd.asInstanceOf[BlockRDD[_]] + assert(blockRDD.blockIds.toSeq === blockIds) + } + + + private def testWithoutWAL(msg: String)(body: ReceiverInputDStream[_] => Unit): Unit = { + test(s"Without WAL enabled: $msg") { + runTest(enableWAL = false, body) + } + } + + private def testWithWAL(msg: String)(body: ReceiverInputDStream[_] => Unit): Unit = { + test(s"With WAL enabled: $msg") { + runTest(enableWAL = true, body) + } + } + + private def runTest(enableWAL: Boolean, body: ReceiverInputDStream[_] => Unit): Unit = { + val conf = new SparkConf() + conf.setMaster("local[4]").setAppName("ReceiverInputDStreamSuite") + conf.set(WriteAheadLogUtils.RECEIVER_WAL_ENABLE_CONF_KEY, enableWAL.toString) + require(WriteAheadLogUtils.enableReceiverLog(conf) === enableWAL) + val ssc = new StreamingContext(conf, Seconds(1)) + val receiverStream = new ReceiverInputDStream[Int](ssc) { + override def getReceiver(): Receiver[Int] = null + } + withStreamingContext(ssc) { ssc => + body(receiverStream) + } + } + + /** + * Create a block info for input to the ReceiverInputDStream.createBlockRDD + * @param withWALInfo Create block with WAL info in it + * @param createBlock Actually create the block in the BlockManager + * @return + */ + private def createBlockInfo( + withWALInfo: Boolean, + createBlock: Boolean = true): ReceivedBlockInfo = { + val blockId = new StreamBlockId(0, Random.nextLong()) + if (createBlock) { + SparkEnv.get.blockManager.putSingle(blockId, 1, StorageLevel.MEMORY_ONLY, tellMaster = true) + require(SparkEnv.get.blockManager.master.contains(blockId)) + } + val storeResult = if (withWALInfo) { + new WriteAheadLogBasedStoreResult(blockId, None, new WriteAheadLogRecordHandle { }) + } else { + new BlockManagerBasedStoreResult(blockId, None) + } + new ReceivedBlockInfo(0, None, None, storeResult) + } +} From 2f493f7e3924b769160a16f73cccbebf21973b91 Mon Sep 17 00:00:00 2001 From: Davies Liu Date: Tue, 25 Aug 2015 16:00:44 +0800 Subject: [PATCH 066/802] [SPARK-10177] [SQL] fix reading Timestamp in parquet from Hive We misunderstood the Julian days and nanoseconds of the day in parquet (as TimestampType) from Hive/Impala, they are overlapped, so can't be added together directly. In order to avoid the confusing rounding when do the converting, we use `2440588` as the Julian Day of epoch of unix timestamp (which should be 2440587.5). Author: Davies Liu Author: Cheng Lian Closes #8400 from davies/timestamp_parquet. 
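To make the rounding change concrete, here is a self-contained sketch of the conversion arithmetic after this patch. It mirrors the updated `DateTimeUtils` methods shown in the diff below, but is illustrative rather than production code.

```scala
object JulianDaySketch {
  // 2440587.5 is the true Julian day of the unix epoch; the code rounds up to 2440588
  // so that day boundaries line up with what Hive/Impala write into Parquet.
  val JulianDayOfEpoch = 2440588L
  val SecondsPerDay    = 24L * 60 * 60
  val MicrosPerSecond  = 1000L * 1000L
  val NanosPerSecond   = MicrosPerSecond * 1000L

  /** Julian day plus nanoseconds-in-day -> microseconds since 1970-01-01 00:00:00 UTC. */
  def fromJulianDay(day: Int, nanos: Long): Long =
    (day - JulianDayOfEpoch) * SecondsPerDay * MicrosPerSecond + nanos / 1000L

  /** Microseconds since the epoch -> (Julian day, nanoseconds in that day). */
  def toJulianDay(us: Long): (Int, Long) = {
    val seconds      = us / MicrosPerSecond
    val day          = (seconds / SecondsPerDay + JulianDayOfEpoch).toInt
    val secondsInDay = seconds % SecondsPerDay
    (day, secondsInDay * NanosPerSecond + (us % MicrosPerSecond) * 1000L)
  }

  def main(args: Array[String]): Unit = {
    // The epoch itself now maps to (2440588, 0) instead of carrying a half-day offset,
    // so timestamps round-trip without a spurious 12-hour shift.
    assert(fromJulianDay(2440588, 0L) == 0L)
    val (d, ns) = toJulianDay(0L)
    assert(d == 2440588 && ns == 0L)
  }
}
```

The previously-ignored timestamp test in `ParquetHiveCompatibilitySuite`, re-enabled further down in this patch, exercises exactly this round trip against values written by Hive.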
--- .../spark/sql/catalyst/util/DateTimeUtils.scala | 7 ++++--- .../sql/catalyst/util/DateTimeUtilsSuite.scala | 13 +++++++++---- .../sql/hive/ParquetHiveCompatibilitySuite.scala | 2 +- 3 files changed, 14 insertions(+), 8 deletions(-) diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/util/DateTimeUtils.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/util/DateTimeUtils.scala index 672620460c3c5..d652fce3fd9b6 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/util/DateTimeUtils.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/util/DateTimeUtils.scala @@ -37,7 +37,8 @@ object DateTimeUtils { type SQLTimestamp = Long // see http://stackoverflow.com/questions/466321/convert-unix-timestamp-to-julian - final val JULIAN_DAY_OF_EPOCH = 2440587 // and .5 + // it's 2440587.5, rounding up to compatible with Hive + final val JULIAN_DAY_OF_EPOCH = 2440588 final val SECONDS_PER_DAY = 60 * 60 * 24L final val MICROS_PER_SECOND = 1000L * 1000L final val NANOS_PER_SECOND = MICROS_PER_SECOND * 1000L @@ -183,7 +184,7 @@ object DateTimeUtils { */ def fromJulianDay(day: Int, nanoseconds: Long): SQLTimestamp = { // use Long to avoid rounding errors - val seconds = (day - JULIAN_DAY_OF_EPOCH).toLong * SECONDS_PER_DAY - SECONDS_PER_DAY / 2 + val seconds = (day - JULIAN_DAY_OF_EPOCH).toLong * SECONDS_PER_DAY seconds * MICROS_PER_SECOND + nanoseconds / 1000L } @@ -191,7 +192,7 @@ object DateTimeUtils { * Returns Julian day and nanoseconds in a day from the number of microseconds */ def toJulianDay(us: SQLTimestamp): (Int, Long) = { - val seconds = us / MICROS_PER_SECOND + SECONDS_PER_DAY / 2 + val seconds = us / MICROS_PER_SECOND val day = seconds / SECONDS_PER_DAY + JULIAN_DAY_OF_EPOCH val secondsInDay = seconds % SECONDS_PER_DAY val nanos = (us % MICROS_PER_SECOND) * 1000L diff --git a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/util/DateTimeUtilsSuite.scala b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/util/DateTimeUtilsSuite.scala index d18fa4df13355..1596bb79fa94b 100644 --- a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/util/DateTimeUtilsSuite.scala +++ b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/util/DateTimeUtilsSuite.scala @@ -49,13 +49,18 @@ class DateTimeUtilsSuite extends SparkFunSuite { test("us and julian day") { val (d, ns) = toJulianDay(0) assert(d === JULIAN_DAY_OF_EPOCH) - assert(ns === SECONDS_PER_DAY / 2 * NANOS_PER_SECOND) + assert(ns === 0) assert(fromJulianDay(d, ns) == 0L) - val t = new Timestamp(61394778610000L) // (2015, 6, 11, 10, 10, 10, 100) + val t = Timestamp.valueOf("2015-06-11 10:10:10.100") val (d1, ns1) = toJulianDay(fromJavaTimestamp(t)) - val t2 = toJavaTimestamp(fromJulianDay(d1, ns1)) - assert(t.equals(t2)) + val t1 = toJavaTimestamp(fromJulianDay(d1, ns1)) + assert(t.equals(t1)) + + val t2 = Timestamp.valueOf("2015-06-11 20:10:10.100") + val (d2, ns2) = toJulianDay(fromJavaTimestamp(t2)) + val t22 = toJavaTimestamp(fromJulianDay(d2, ns2)) + assert(t2.equals(t22)) } test("SPARK-6785: java date conversion before and after epoch") { diff --git a/sql/hive/src/test/scala/org/apache/spark/sql/hive/ParquetHiveCompatibilitySuite.scala b/sql/hive/src/test/scala/org/apache/spark/sql/hive/ParquetHiveCompatibilitySuite.scala index bc30180cf0917..91d7a48208e8d 100644 --- a/sql/hive/src/test/scala/org/apache/spark/sql/hive/ParquetHiveCompatibilitySuite.scala +++ b/sql/hive/src/test/scala/org/apache/spark/sql/hive/ParquetHiveCompatibilitySuite.scala @@ 
-113,7 +113,7 @@ class ParquetHiveCompatibilitySuite extends ParquetCompatibilityTest with Before "BOOLEAN", "TINYINT", "SMALLINT", "INT", "BIGINT", "FLOAT", "DOUBLE", "STRING") } - ignore("SPARK-10177 timestamp") { + test("SPARK-10177 timestamp") { testParquetHiveCompatibility(Row(Timestamp.valueOf("2015-08-24 00:31:00")), "TIMESTAMP") } From 7bc9a8c6249300ded31ea931c463d0a8f798e193 Mon Sep 17 00:00:00 2001 From: Josh Rosen Date: Tue, 25 Aug 2015 01:06:36 -0700 Subject: [PATCH 067/802] [SPARK-10195] [SQL] Data sources Filter should not expose internal types Spark SQL's data sources API exposes Catalyst's internal types through its Filter interfaces. This is a problem because types like UTF8String are not stable developer APIs and should not be exposed to third-parties. This issue caused incompatibilities when upgrading our `spark-redshift` library to work against Spark 1.5.0. To avoid these issues in the future we should only expose public types through these Filter objects. This patch accomplishes this by using CatalystTypeConverters to add the appropriate conversions. Author: Josh Rosen Closes #8403 from JoshRosen/datasources-internal-vs-external-types. --- .../datasources/DataSourceStrategy.scala | 67 ++++++++++--------- .../execution/datasources/jdbc/JDBCRDD.scala | 2 +- .../datasources/parquet/ParquetFilters.scala | 19 +++--- .../spark/sql/sources/FilteredScanSuite.scala | 7 ++ 4 files changed, 54 insertions(+), 41 deletions(-) diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/DataSourceStrategy.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/DataSourceStrategy.scala index 2a4c40db8bb66..6c1ef6a6df887 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/DataSourceStrategy.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/DataSourceStrategy.scala @@ -20,7 +20,8 @@ package org.apache.spark.sql.execution.datasources import org.apache.spark.{Logging, TaskContext} import org.apache.spark.deploy.SparkHadoopUtil import org.apache.spark.rdd.{MapPartitionsRDD, RDD, UnionRDD} -import org.apache.spark.sql.catalyst.{InternalRow, expressions} +import org.apache.spark.sql.catalyst.CatalystTypeConverters.convertToScala +import org.apache.spark.sql.catalyst.{CatalystTypeConverters, InternalRow, expressions} import org.apache.spark.sql.catalyst.expressions._ import org.apache.spark.sql.catalyst.planning.PhysicalOperation import org.apache.spark.sql.catalyst.plans.logical @@ -344,45 +345,47 @@ private[sql] object DataSourceStrategy extends Strategy with Logging { */ protected[sql] def selectFilters(filters: Seq[Expression]) = { def translate(predicate: Expression): Option[Filter] = predicate match { - case expressions.EqualTo(a: Attribute, Literal(v, _)) => - Some(sources.EqualTo(a.name, v)) - case expressions.EqualTo(Literal(v, _), a: Attribute) => - Some(sources.EqualTo(a.name, v)) - - case expressions.EqualNullSafe(a: Attribute, Literal(v, _)) => - Some(sources.EqualNullSafe(a.name, v)) - case expressions.EqualNullSafe(Literal(v, _), a: Attribute) => - Some(sources.EqualNullSafe(a.name, v)) - - case expressions.GreaterThan(a: Attribute, Literal(v, _)) => - Some(sources.GreaterThan(a.name, v)) - case expressions.GreaterThan(Literal(v, _), a: Attribute) => - Some(sources.LessThan(a.name, v)) - - case expressions.LessThan(a: Attribute, Literal(v, _)) => - Some(sources.LessThan(a.name, v)) - case expressions.LessThan(Literal(v, _), a: Attribute) => - Some(sources.GreaterThan(a.name, v)) - - 
case expressions.GreaterThanOrEqual(a: Attribute, Literal(v, _)) => - Some(sources.GreaterThanOrEqual(a.name, v)) - case expressions.GreaterThanOrEqual(Literal(v, _), a: Attribute) => - Some(sources.LessThanOrEqual(a.name, v)) - - case expressions.LessThanOrEqual(a: Attribute, Literal(v, _)) => - Some(sources.LessThanOrEqual(a.name, v)) - case expressions.LessThanOrEqual(Literal(v, _), a: Attribute) => - Some(sources.GreaterThanOrEqual(a.name, v)) + case expressions.EqualTo(a: Attribute, Literal(v, t)) => + Some(sources.EqualTo(a.name, convertToScala(v, t))) + case expressions.EqualTo(Literal(v, t), a: Attribute) => + Some(sources.EqualTo(a.name, convertToScala(v, t))) + + case expressions.EqualNullSafe(a: Attribute, Literal(v, t)) => + Some(sources.EqualNullSafe(a.name, convertToScala(v, t))) + case expressions.EqualNullSafe(Literal(v, t), a: Attribute) => + Some(sources.EqualNullSafe(a.name, convertToScala(v, t))) + + case expressions.GreaterThan(a: Attribute, Literal(v, t)) => + Some(sources.GreaterThan(a.name, convertToScala(v, t))) + case expressions.GreaterThan(Literal(v, t), a: Attribute) => + Some(sources.LessThan(a.name, convertToScala(v, t))) + + case expressions.LessThan(a: Attribute, Literal(v, t)) => + Some(sources.LessThan(a.name, convertToScala(v, t))) + case expressions.LessThan(Literal(v, t), a: Attribute) => + Some(sources.GreaterThan(a.name, convertToScala(v, t))) + + case expressions.GreaterThanOrEqual(a: Attribute, Literal(v, t)) => + Some(sources.GreaterThanOrEqual(a.name, convertToScala(v, t))) + case expressions.GreaterThanOrEqual(Literal(v, t), a: Attribute) => + Some(sources.LessThanOrEqual(a.name, convertToScala(v, t))) + + case expressions.LessThanOrEqual(a: Attribute, Literal(v, t)) => + Some(sources.LessThanOrEqual(a.name, convertToScala(v, t))) + case expressions.LessThanOrEqual(Literal(v, t), a: Attribute) => + Some(sources.GreaterThanOrEqual(a.name, convertToScala(v, t))) case expressions.InSet(a: Attribute, set) => - Some(sources.In(a.name, set.toArray)) + val toScala = CatalystTypeConverters.createToScalaConverter(a.dataType) + Some(sources.In(a.name, set.toArray.map(toScala))) // Because we only convert In to InSet in Optimizer when there are more than certain // items. So it is possible we still get an In expression here that needs to be pushed // down. case expressions.In(a: Attribute, list) if !list.exists(!_.isInstanceOf[Literal]) => val hSet = list.map(e => e.eval(EmptyRow)) - Some(sources.In(a.name, hSet.toArray)) + val toScala = CatalystTypeConverters.createToScalaConverter(a.dataType) + Some(sources.In(a.name, hSet.toArray.map(toScala))) case expressions.IsNull(a: Attribute) => Some(sources.IsNull(a.name)) diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/jdbc/JDBCRDD.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/jdbc/JDBCRDD.scala index e537d631f4559..730d88b024cb1 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/jdbc/JDBCRDD.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/jdbc/JDBCRDD.scala @@ -262,7 +262,7 @@ private[sql] class JDBCRDD( * Converts value to SQL expression. 
*/ private def compileValue(value: Any): Any = value match { - case stringValue: UTF8String => s"'${escapeSql(stringValue.toString)}'" + case stringValue: String => s"'${escapeSql(stringValue)}'" case _ => value } diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/parquet/ParquetFilters.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/parquet/ParquetFilters.scala index c74c8388632f5..c6b3fe7900da8 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/parquet/ParquetFilters.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/parquet/ParquetFilters.scala @@ -32,7 +32,6 @@ import org.apache.spark.SparkEnv import org.apache.spark.sql.catalyst.expressions._ import org.apache.spark.sql.sources import org.apache.spark.sql.types._ -import org.apache.spark.unsafe.types.UTF8String private[sql] object ParquetFilters { val PARQUET_FILTER_DATA = "org.apache.spark.sql.parquet.row.filter" @@ -65,7 +64,7 @@ private[sql] object ParquetFilters { case StringType => (n: String, v: Any) => FilterApi.eq( binaryColumn(n), - Option(v).map(s => Binary.fromByteArray(s.asInstanceOf[UTF8String].getBytes)).orNull) + Option(v).map(s => Binary.fromByteArray(s.asInstanceOf[String].getBytes("utf-8"))).orNull) case BinaryType => (n: String, v: Any) => FilterApi.eq( binaryColumn(n), @@ -86,7 +85,7 @@ private[sql] object ParquetFilters { case StringType => (n: String, v: Any) => FilterApi.notEq( binaryColumn(n), - Option(v).map(s => Binary.fromByteArray(s.asInstanceOf[UTF8String].getBytes)).orNull) + Option(v).map(s => Binary.fromByteArray(s.asInstanceOf[String].getBytes("utf-8"))).orNull) case BinaryType => (n: String, v: Any) => FilterApi.notEq( binaryColumn(n), @@ -104,7 +103,8 @@ private[sql] object ParquetFilters { (n: String, v: Any) => FilterApi.lt(doubleColumn(n), v.asInstanceOf[java.lang.Double]) case StringType => (n: String, v: Any) => - FilterApi.lt(binaryColumn(n), Binary.fromByteArray(v.asInstanceOf[UTF8String].getBytes)) + FilterApi.lt(binaryColumn(n), + Binary.fromByteArray(v.asInstanceOf[String].getBytes("utf-8"))) case BinaryType => (n: String, v: Any) => FilterApi.lt(binaryColumn(n), Binary.fromByteArray(v.asInstanceOf[Array[Byte]])) @@ -121,7 +121,8 @@ private[sql] object ParquetFilters { (n: String, v: Any) => FilterApi.ltEq(doubleColumn(n), v.asInstanceOf[java.lang.Double]) case StringType => (n: String, v: Any) => - FilterApi.ltEq(binaryColumn(n), Binary.fromByteArray(v.asInstanceOf[UTF8String].getBytes)) + FilterApi.ltEq(binaryColumn(n), + Binary.fromByteArray(v.asInstanceOf[String].getBytes("utf-8"))) case BinaryType => (n: String, v: Any) => FilterApi.ltEq(binaryColumn(n), Binary.fromByteArray(v.asInstanceOf[Array[Byte]])) @@ -138,7 +139,8 @@ private[sql] object ParquetFilters { (n: String, v: Any) => FilterApi.gt(doubleColumn(n), v.asInstanceOf[java.lang.Double]) case StringType => (n: String, v: Any) => - FilterApi.gt(binaryColumn(n), Binary.fromByteArray(v.asInstanceOf[UTF8String].getBytes)) + FilterApi.gt(binaryColumn(n), + Binary.fromByteArray(v.asInstanceOf[String].getBytes("utf-8"))) case BinaryType => (n: String, v: Any) => FilterApi.gt(binaryColumn(n), Binary.fromByteArray(v.asInstanceOf[Array[Byte]])) @@ -155,7 +157,8 @@ private[sql] object ParquetFilters { (n: String, v: Any) => FilterApi.gtEq(doubleColumn(n), v.asInstanceOf[java.lang.Double]) case StringType => (n: String, v: Any) => - FilterApi.gtEq(binaryColumn(n), Binary.fromByteArray(v.asInstanceOf[UTF8String].getBytes)) + 
FilterApi.gtEq(binaryColumn(n), + Binary.fromByteArray(v.asInstanceOf[String].getBytes("utf-8"))) case BinaryType => (n: String, v: Any) => FilterApi.gtEq(binaryColumn(n), Binary.fromByteArray(v.asInstanceOf[Array[Byte]])) @@ -177,7 +180,7 @@ private[sql] object ParquetFilters { case StringType => (n: String, v: Set[Any]) => FilterApi.userDefined(binaryColumn(n), - SetInFilter(v.map(e => Binary.fromByteArray(e.asInstanceOf[UTF8String].getBytes)))) + SetInFilter(v.map(s => Binary.fromByteArray(s.asInstanceOf[String].getBytes("utf-8"))))) case BinaryType => (n: String, v: Set[Any]) => FilterApi.userDefined(binaryColumn(n), diff --git a/sql/core/src/test/scala/org/apache/spark/sql/sources/FilteredScanSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/sources/FilteredScanSuite.scala index c81c3d3982805..68ce37c00077e 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/sources/FilteredScanSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/sources/FilteredScanSuite.scala @@ -20,6 +20,7 @@ package org.apache.spark.sql.sources import scala.language.existentials import org.apache.spark.rdd.RDD +import org.apache.spark.unsafe.types.UTF8String import org.apache.spark.sql._ import org.apache.spark.sql.test.SharedSQLContext import org.apache.spark.sql.types._ @@ -78,6 +79,9 @@ case class SimpleFilteredScan(from: Int, to: Int)(@transient val sqlContext: SQL case StringStartsWith("c", v) => _.startsWith(v) case StringEndsWith("c", v) => _.endsWith(v) case StringContains("c", v) => _.contains(v) + case EqualTo("c", v: String) => _.equals(v) + case EqualTo("c", v: UTF8String) => sys.error("UTF8String should not appear in filters") + case In("c", values) => (s: String) => values.map(_.asInstanceOf[String]).toSet.contains(s) case _ => (c: String) => true } @@ -237,6 +241,9 @@ class FilteredScanSuite extends DataSourceTest with SharedSQLContext { testPushDown("SELECT a, b, c FROM oneToTenFiltered WHERE c like '%eE%'", 1) testPushDown("SELECT a, b, c FROM oneToTenFiltered WHERE c like '%Ee%'", 0) + testPushDown("SELECT c FROM oneToTenFiltered WHERE c = 'aaaaaAAAAA'", 1) + testPushDown("SELECT c FROM oneToTenFiltered WHERE c IN ('aaaaaAAAAA', 'foo')", 1) + def testPushDown(sqlString: String, expectedCount: Int): Unit = { test(s"PushDown Returns $expectedCount: $sqlString") { val queryExecution = sql(sqlString).queryExecution From 0e6368ffaec1965d0c7f89420e04a974675c7f6e Mon Sep 17 00:00:00 2001 From: Yin Huai Date: Tue, 25 Aug 2015 16:19:34 +0800 Subject: [PATCH 068/802] [SPARK-10197] [SQL] Add null check in wrapperFor (inside HiveInspectors). https://issues.apache.org/jira/browse/SPARK-10197 Author: Yin Huai Closes #8407 from yhuai/ORCSPARK-10197. 
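The pattern behind the fix is simply that every Catalyst-to-Hive wrapper must pass `null` through instead of dereferencing it. Below is a minimal, dependency-free sketch of that pattern; the `nullSafe` helper is hypothetical, since the patch itself inlines the checks, as shown in the diff below.

```scala
// Hypothetical helper for illustration only: lift a conversion into a null-tolerant wrapper.
def nullSafe[A](convert: A => Any): Any => Any = {
  case null => null
  case o    => convert(o.asInstanceOf[A])
}

// Example with a trivial conversion: nulls flow through instead of triggering an NPE,
// which is what the added checks achieve for the varchar/decimal/date/timestamp wrappers.
val intWrapper: Any => Any = nullSafe[Int](i => java.lang.Integer.valueOf(i))
assert(intWrapper(null) == null)
assert(intWrapper(3) == 3)
```

The new "write null values" test added to `OrcSourceSuite` in this patch covers all of the affected types in one pass.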
--- .../spark/sql/hive/HiveInspectors.scala | 29 +++++++++++++++---- .../spark/sql/hive/orc/OrcSourceSuite.scala | 29 +++++++++++++++++++ 2 files changed, 53 insertions(+), 5 deletions(-) diff --git a/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveInspectors.scala b/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveInspectors.scala index 9824dad239596..64fffdbf9b020 100644 --- a/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveInspectors.scala +++ b/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveInspectors.scala @@ -370,17 +370,36 @@ private[hive] trait HiveInspectors { protected def wrapperFor(oi: ObjectInspector, dataType: DataType): Any => Any = oi match { case _: JavaHiveVarcharObjectInspector => (o: Any) => - val s = o.asInstanceOf[UTF8String].toString - new HiveVarchar(s, s.size) + if (o != null) { + val s = o.asInstanceOf[UTF8String].toString + new HiveVarchar(s, s.size) + } else { + null + } case _: JavaHiveDecimalObjectInspector => - (o: Any) => HiveDecimal.create(o.asInstanceOf[Decimal].toJavaBigDecimal) + (o: Any) => + if (o != null) { + HiveDecimal.create(o.asInstanceOf[Decimal].toJavaBigDecimal) + } else { + null + } case _: JavaDateObjectInspector => - (o: Any) => DateTimeUtils.toJavaDate(o.asInstanceOf[Int]) + (o: Any) => + if (o != null) { + DateTimeUtils.toJavaDate(o.asInstanceOf[Int]) + } else { + null + } case _: JavaTimestampObjectInspector => - (o: Any) => DateTimeUtils.toJavaTimestamp(o.asInstanceOf[Long]) + (o: Any) => + if (o != null) { + DateTimeUtils.toJavaTimestamp(o.asInstanceOf[Long]) + } else { + null + } case soi: StandardStructObjectInspector => val schema = dataType.asInstanceOf[StructType] diff --git a/sql/hive/src/test/scala/org/apache/spark/sql/hive/orc/OrcSourceSuite.scala b/sql/hive/src/test/scala/org/apache/spark/sql/hive/orc/OrcSourceSuite.scala index 82e08caf46457..80c38084f293d 100644 --- a/sql/hive/src/test/scala/org/apache/spark/sql/hive/orc/OrcSourceSuite.scala +++ b/sql/hive/src/test/scala/org/apache/spark/sql/hive/orc/OrcSourceSuite.scala @@ -121,6 +121,35 @@ abstract class OrcSuite extends QueryTest with BeforeAndAfterAll { sql("SELECT * FROM normal_orc_as_source"), (6 to 10).map(i => Row(i, s"part-$i"))) } + + test("write null values") { + sql("DROP TABLE IF EXISTS orcNullValues") + + val df = sql( + """ + |SELECT + | CAST(null as TINYINT), + | CAST(null as SMALLINT), + | CAST(null as INT), + | CAST(null as BIGINT), + | CAST(null as FLOAT), + | CAST(null as DOUBLE), + | CAST(null as DECIMAL(7,2)), + | CAST(null as TIMESTAMP), + | CAST(null as DATE), + | CAST(null as STRING), + | CAST(null as VARCHAR(10)) + |FROM orc_temp_table limit 1 + """.stripMargin) + + df.write.format("orc").saveAsTable("orcNullValues") + + checkAnswer( + sql("SELECT * FROM orcNullValues"), + Row.fromSeq(Seq.fill(11)(null))) + + sql("DROP TABLE IF EXISTS orcNullValues") + } } class OrcSourceSuite extends OrcSuite { From 5c14890159a5711072bf395f662b2433a389edf9 Mon Sep 17 00:00:00 2001 From: "Zhang, Liye" Date: Tue, 25 Aug 2015 11:48:55 +0100 Subject: [PATCH 069/802] [DOC] add missing parameters in SparkContext.scala for scala doc Author: Zhang, Liye Closes #8412 from liyezhang556520/minorDoc. 
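[Editor's note] For readers unfamiliar with the convention being filled in below, a small illustrative sketch of Scaladoc `@param` tags; `ScaladocExample` and `loadPaths` are made-up names and not part of SparkContext:

```scala
object ScaladocExample {
  /**
   * Illustrative only: shows the `@param`/`@return` style the patch adds.
   *
   * @param path Directory to the input data files; the path can be a comma
   *             separated list of inputs.
   * @param minPartitions A suggestion value of the minimal splitting number
   *                      for the input data.
   * @return the individual paths parsed out of `path`
   */
  def loadPaths(path: String, minPartitions: Int): Seq[String] = {
    require(minPartitions >= 1, "minPartitions must be positive")
    path.split(",").toSeq
  }

  def main(args: Array[String]): Unit =
    println(loadPaths("/data/a,/data/b", 2))
}
```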
--- .../scala/org/apache/spark/SparkContext.scala | 15 ++++++++++++++- 1 file changed, 14 insertions(+), 1 deletion(-) diff --git a/core/src/main/scala/org/apache/spark/SparkContext.scala b/core/src/main/scala/org/apache/spark/SparkContext.scala index 1ddaca8a5ba8c..9849aff85d72e 100644 --- a/core/src/main/scala/org/apache/spark/SparkContext.scala +++ b/core/src/main/scala/org/apache/spark/SparkContext.scala @@ -114,6 +114,7 @@ class SparkContext(config: SparkConf) extends Logging with ExecutorAllocationCli * :: DeveloperApi :: * Alternative constructor for setting preferred locations where Spark will create executors. * + * @param config a [[org.apache.spark.SparkConf]] object specifying other Spark parameters * @param preferredNodeLocationData used in YARN mode to select nodes to launch containers on. * Can be generated using [[org.apache.spark.scheduler.InputFormatInfo.computePreferredLocations]] * from a list of input files or InputFormats for the application. @@ -145,6 +146,9 @@ class SparkContext(config: SparkConf) extends Logging with ExecutorAllocationCli * @param jars Collection of JARs to send to the cluster. These can be paths on the local file * system or HDFS, HTTP, HTTPS, or FTP URLs. * @param environment Environment variables to set on worker nodes. + * @param preferredNodeLocationData used in YARN mode to select nodes to launch containers on. + * Can be generated using [[org.apache.spark.scheduler.InputFormatInfo.computePreferredLocations]] + * from a list of input files or InputFormats for the application. */ def this( master: String, @@ -841,6 +845,9 @@ class SparkContext(config: SparkConf) extends Logging with ExecutorAllocationCli * @note Small files are preferred, large file is also allowable, but may cause bad performance. * @note On some filesystems, `.../path/*` can be a more efficient way to read all files * in a directory rather than `.../path/` or `.../path` + * + * @param path Directory to the input data files, the path can be comma separated paths as the + * list of inputs. * @param minPartitions A suggestion value of the minimal splitting number for input data. */ def wholeTextFiles( @@ -889,6 +896,9 @@ class SparkContext(config: SparkConf) extends Logging with ExecutorAllocationCli * @note Small files are preferred; very large files may cause bad performance. * @note On some filesystems, `.../path/*` can be a more efficient way to read all files * in a directory rather than `.../path/` or `.../path` + * + * @param path Directory to the input data files, the path can be comma separated paths as the + * list of inputs. * @param minPartitions A suggestion value of the minimal splitting number for input data. */ @Experimental @@ -918,8 +928,11 @@ class SparkContext(config: SparkConf) extends Logging with ExecutorAllocationCli * '''Note:''' We ensure that the byte array for each record in the resulting RDD * has the provided record length. * - * @param path Directory to the input data files + * @param path Directory to the input data files, the path can be comma separated paths as the + * list of inputs. * @param recordLength The length at which to split the records + * @param conf Configuration for setting up the dataset. + * * @return An RDD of data with values, represented as byte arrays */ @Experimental From 7f1e507bf7e82bff323c5dec3c1ee044687c4173 Mon Sep 17 00:00:00 2001 From: ehnalis Date: Tue, 25 Aug 2015 12:30:06 +0100 Subject: [PATCH 070/802] Fixed a typo in DAGScheduler. Author: ehnalis Closes #8308 from ehnalis/master. 
--- .../apache/spark/scheduler/DAGScheduler.scala | 27 ++++++++++++++----- 1 file changed, 20 insertions(+), 7 deletions(-) diff --git a/core/src/main/scala/org/apache/spark/scheduler/DAGScheduler.scala b/core/src/main/scala/org/apache/spark/scheduler/DAGScheduler.scala index 684db6646765f..daf9b0f95273e 100644 --- a/core/src/main/scala/org/apache/spark/scheduler/DAGScheduler.scala +++ b/core/src/main/scala/org/apache/spark/scheduler/DAGScheduler.scala @@ -152,17 +152,24 @@ class DAGScheduler( // may lead to more delay in scheduling if those locations are busy. private[scheduler] val REDUCER_PREF_LOCS_FRACTION = 0.2 - // Called by TaskScheduler to report task's starting. + /** + * Called by the TaskSetManager to report task's starting. + */ def taskStarted(task: Task[_], taskInfo: TaskInfo) { eventProcessLoop.post(BeginEvent(task, taskInfo)) } - // Called to report that a task has completed and results are being fetched remotely. + /** + * Called by the TaskSetManager to report that a task has completed + * and results are being fetched remotely. + */ def taskGettingResult(taskInfo: TaskInfo) { eventProcessLoop.post(GettingResultEvent(taskInfo)) } - // Called by TaskScheduler to report task completions or failures. + /** + * Called by the TaskSetManager to report task completions or failures. + */ def taskEnded( task: Task[_], reason: TaskEndReason, @@ -188,18 +195,24 @@ class DAGScheduler( BlockManagerHeartbeat(blockManagerId), new RpcTimeout(600 seconds, "BlockManagerHeartbeat")) } - // Called by TaskScheduler when an executor fails. + /** + * Called by TaskScheduler implementation when an executor fails. + */ def executorLost(execId: String): Unit = { eventProcessLoop.post(ExecutorLost(execId)) } - // Called by TaskScheduler when a host is added + /** + * Called by TaskScheduler implementation when a host is added. + */ def executorAdded(execId: String, host: String): Unit = { eventProcessLoop.post(ExecutorAdded(execId, host)) } - // Called by TaskScheduler to cancel an entire TaskSet due to either repeated failures or - // cancellation of the job itself. + /** + * Called by the TaskSetManager to cancel an entire TaskSet due to either repeated failures or + * cancellation of the job itself. + */ def taskSetFailed(taskSet: TaskSet, reason: String, exception: Option[Throwable]): Unit = { eventProcessLoop.post(TaskSetFailed(taskSet, reason, exception)) } From 69c9c177160e32a2fbc9b36ecc52156077fca6fc Mon Sep 17 00:00:00 2001 From: Sean Owen Date: Tue, 25 Aug 2015 12:33:13 +0100 Subject: [PATCH 071/802] [SPARK-9613] [CORE] Ban use of JavaConversions and migrate all existing uses to JavaConverters Replace `JavaConversions` implicits with `JavaConverters` Most occurrences I've seen so far are necessary conversions; a few have been avoidable. None are in critical code as far as I see, yet. Author: Sean Owen Closes #8033 from srowen/SPARK-9613. 
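[Editor's note] To make the migration below easier to follow, here is a minimal standalone sketch (not Spark code; `ConvertersDemo` is a made-up object) contrasting the two styles: `JavaConverters` requires an explicit `.asScala`/`.asJava` at each Java/Scala boundary, so conversions are visible at the call site instead of happening through hidden implicits.

```scala
import java.util.{ArrayList => JArrayList}
import scala.collection.JavaConverters._

object ConvertersDemo {
  def main(args: Array[String]): Unit = {
    val javaList = new JArrayList[String]()
    javaList.add("a")
    javaList.add("b")

    // Explicit, wrapper-based view of the Java list as a Scala collection
    // (no copying), where JavaConversions would have converted implicitly.
    val scalaSeq: Seq[String] = javaList.asScala
    println(scalaSeq.mkString(","))          // a,b

    // And back again when calling a Java API that expects java.util.List.
    val backToJava: java.util.List[String] = scalaSeq.asJava
    println(backToJava.size())               // 2
  }
}
```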
--- .../shuffle/unsafe/UnsafeShuffleWriter.java | 4 +- .../org/apache/spark/MapOutputTracker.scala | 4 +- .../scala/org/apache/spark/SSLOptions.scala | 11 +- .../scala/org/apache/spark/SparkContext.scala | 4 +- .../scala/org/apache/spark/TestUtils.scala | 9 +- .../apache/spark/api/java/JavaHadoopRDD.scala | 4 +- .../spark/api/java/JavaNewHadoopRDD.scala | 4 +- .../apache/spark/api/java/JavaPairRDD.scala | 19 ++- .../apache/spark/api/java/JavaRDDLike.scala | 75 +++++------- .../spark/api/java/JavaSparkContext.scala | 20 ++-- .../spark/api/python/PythonHadoopUtil.scala | 28 ++--- .../apache/spark/api/python/PythonRDD.scala | 26 ++--- .../apache/spark/api/python/PythonUtils.scala | 15 ++- .../api/python/PythonWorkerFactory.scala | 11 +- .../apache/spark/api/python/SerDeUtil.scala | 3 +- .../WriteInputFormatTestDataGenerator.scala | 8 +- .../scala/org/apache/spark/api/r/RRDD.scala | 13 ++- .../scala/org/apache/spark/api/r/RUtils.scala | 5 +- .../scala/org/apache/spark/api/r/SerDe.scala | 4 +- .../spark/broadcast/TorrentBroadcast.scala | 4 +- .../spark/deploy/ExternalShuffleService.scala | 8 +- .../apache/spark/deploy/PythonRunner.scala | 4 +- .../apache/spark/deploy/RPackageUtils.scala | 4 +- .../org/apache/spark/deploy/RRunner.scala | 4 +- .../spark/deploy/SparkCuratorUtil.scala | 4 +- .../apache/spark/deploy/SparkHadoopUtil.scala | 19 +-- .../spark/deploy/SparkSubmitArguments.scala | 6 +- .../master/ZooKeeperPersistenceEngine.scala | 6 +- .../spark/deploy/worker/CommandUtils.scala | 5 +- .../spark/deploy/worker/DriverRunner.scala | 8 +- .../spark/deploy/worker/ExecutorRunner.scala | 7 +- .../apache/spark/deploy/worker/Worker.scala | 1 - .../org/apache/spark/executor/Executor.scala | 6 +- .../spark/executor/ExecutorSource.scala | 4 +- .../spark/executor/MesosExecutorBackend.scala | 6 +- .../spark/input/PortableDataStream.scala | 11 +- .../input/WholeTextFileInputFormat.scala | 8 +- .../spark/launcher/WorkerCommandBuilder.scala | 4 +- .../apache/spark/metrics/MetricsConfig.scala | 22 ++-- .../network/netty/NettyBlockRpcServer.scala | 4 +- .../netty/NettyBlockTransferService.scala | 6 +- .../apache/spark/network/nio/Connection.scala | 4 +- .../spark/partial/GroupedCountEvaluator.scala | 10 +- .../spark/partial/GroupedMeanEvaluator.scala | 10 +- .../spark/partial/GroupedSumEvaluator.scala | 10 +- .../apache/spark/rdd/PairRDDFunctions.scala | 6 +- .../scala/org/apache/spark/rdd/PipedRDD.scala | 6 +- .../org/apache/spark/rdd/SubtractedRDD.scala | 4 +- .../spark/scheduler/InputFormatInfo.scala | 4 +- .../org/apache/spark/scheduler/Pool.scala | 10 +- .../mesos/CoarseMesosSchedulerBackend.scala | 20 ++-- .../mesos/MesosClusterPersistenceEngine.scala | 4 +- .../cluster/mesos/MesosClusterScheduler.scala | 14 +-- .../cluster/mesos/MesosSchedulerBackend.scala | 22 ++-- .../cluster/mesos/MesosSchedulerUtils.scala | 25 ++-- .../spark/serializer/KryoSerializer.scala | 10 +- .../shuffle/FileShuffleBlockResolver.scala | 8 +- .../storage/BlockManagerMasterEndpoint.scala | 8 +- .../org/apache/spark/util/AkkaUtils.scala | 4 +- .../org/apache/spark/util/ListenerBus.scala | 7 +- .../spark/util/MutableURLClassLoader.scala | 2 - .../spark/util/TimeStampedHashMap.scala | 10 +- .../spark/util/TimeStampedHashSet.scala | 4 +- .../scala/org/apache/spark/util/Utils.scala | 20 ++-- .../apache/spark/util/collection/Utils.scala | 4 +- .../java/org/apache/spark/JavaAPISuite.java | 6 +- .../org/apache/spark/SparkConfSuite.scala | 7 +- .../spark/deploy/LogUrlsStandaloneSuite.scala | 1 - .../spark/deploy/RPackageUtilsSuite.scala 
| 8 +- .../deploy/worker/ExecutorRunnerTest.scala | 5 +- .../spark/scheduler/SparkListenerSuite.scala | 9 +- .../mesos/MesosSchedulerBackendSuite.scala | 15 +-- .../serializer/KryoSerializerSuite.scala | 3 +- .../org/apache/spark/ui/UISeleniumSuite.scala | 21 ++-- .../spark/examples/CassandraCQLTest.scala | 15 +-- .../apache/spark/examples/CassandraTest.scala | 6 +- .../spark/examples/DriverSubmissionTest.scala | 6 +- .../pythonconverters/AvroConverters.scala | 16 +-- .../CassandraConverters.scala | 14 ++- .../pythonconverters/HBaseConverters.scala | 5 +- .../streaming/flume/sink/SparkSinkSuite.scala | 4 +- .../streaming/flume/EventTransformer.scala | 4 +- .../streaming/flume/FlumeBatchFetcher.scala | 3 +- .../streaming/flume/FlumeInputDStream.scala | 7 +- .../flume/FlumePollingInputDStream.scala | 6 +- .../streaming/flume/FlumeTestUtils.scala | 10 +- .../spark/streaming/flume/FlumeUtils.scala | 8 +- .../flume/PollingFlumeTestUtils.scala | 16 ++- .../flume/FlumePollingStreamSuite.scala | 8 +- .../streaming/flume/FlumeStreamSuite.scala | 2 +- .../streaming/kafka/KafkaTestUtils.scala | 4 +- .../spark/streaming/kafka/KafkaUtils.scala | 35 +++--- .../spark/streaming/zeromq/ZeroMQUtils.scala | 15 ++- .../kinesis/KinesisBackedBlockRDD.scala | 4 +- .../streaming/kinesis/KinesisReceiver.scala | 4 +- .../streaming/kinesis/KinesisTestUtils.scala | 3 +- .../kinesis/KinesisReceiverSuite.scala | 12 +- .../mllib/util/LinearDataGenerator.scala | 4 +- .../ml/classification/JavaOneVsRestSuite.java | 7 +- .../LogisticRegressionSuite.scala | 4 +- .../spark/mllib/classification/SVMSuite.scala | 4 +- .../optimization/GradientDescentSuite.scala | 4 +- .../spark/mllib/recommendation/ALSSuite.scala | 4 +- project/SparkBuild.scala | 8 +- python/pyspark/sql/column.py | 12 ++ python/pyspark/sql/dataframe.py | 4 +- scalastyle-config.xml | 7 ++ .../main/scala/org/apache/spark/sql/Row.scala | 12 +- .../spark/sql/catalyst/analysis/Catalog.scala | 4 +- .../spark/sql/DataFrameNaFunctions.scala | 8 +- .../apache/spark/sql/DataFrameReader.scala | 4 +- .../apache/spark/sql/DataFrameWriter.scala | 4 +- .../org/apache/spark/sql/GroupedData.scala | 4 +- .../scala/org/apache/spark/sql/SQLConf.scala | 13 ++- .../org/apache/spark/sql/SQLContext.scala | 8 +- .../datasources/ResolvedDataSource.scala | 4 +- .../parquet/CatalystReadSupport.scala | 8 +- .../parquet/CatalystRowConverter.scala | 4 +- .../parquet/CatalystSchemaConverter.scala | 4 +- .../datasources/parquet/ParquetRelation.scala | 13 ++- .../parquet/ParquetTypesConverter.scala | 4 +- .../joins/ShuffledHashOuterJoin.scala | 6 +- .../spark/sql/execution/pythonUDFs.scala | 11 +- .../apache/spark/sql/JavaDataFrameSuite.java | 8 +- .../spark/sql/DataFrameNaFunctionsSuite.scala | 6 +- .../org/apache/spark/sql/QueryTest.scala | 4 +- .../ParquetAvroCompatibilitySuite.scala | 3 +- .../parquet/ParquetCompatibilityTest.scala | 7 +- .../datasources/parquet/ParquetIOSuite.scala | 25 ++-- .../SparkExecuteStatementOperation.scala | 10 +- .../hive/thriftserver/SparkSQLCLIDriver.scala | 16 +-- .../thriftserver/SparkSQLCLIService.scala | 6 +- .../hive/thriftserver/SparkSQLDriver.scala | 14 +-- .../sql/hive/thriftserver/SparkSQLEnv.scala | 4 +- .../apache/spark/sql/hive/HiveContext.scala | 4 +- .../spark/sql/hive/HiveInspectors.scala | 40 +++---- .../spark/sql/hive/HiveMetastoreCatalog.scala | 12 +- .../org/apache/spark/sql/hive/HiveQl.scala | 110 ++++++++++-------- .../org/apache/spark/sql/hive/HiveShim.scala | 5 +- .../spark/sql/hive/client/ClientWrapper.scala | 27 ++--- 
.../spark/sql/hive/client/HiveShim.scala | 14 +-- .../execution/DescribeHiveTableCommand.scala | 8 +- .../sql/hive/execution/HiveTableScan.scala | 9 +- .../hive/execution/InsertIntoHiveTable.scala | 12 +- .../hive/execution/ScriptTransformation.scala | 12 +- .../org/apache/spark/sql/hive/hiveUDFs.scala | 9 +- .../spark/sql/hive/orc/OrcRelation.scala | 8 +- .../apache/spark/sql/hive/test/TestHive.scala | 11 +- .../spark/sql/hive/client/FiltersSuite.scala | 4 +- .../sql/hive/execution/HiveUDFSuite.scala | 29 +++-- .../sql/hive/execution/PruningSuite.scala | 7 +- .../sql/hive/execution/SQLQuerySuite.scala | 4 +- .../sql/sources/hadoopFsRelationSuites.scala | 8 +- .../streaming/api/java/JavaDStreamLike.scala | 12 +- .../streaming/api/java/JavaPairDStream.scala | 28 ++--- .../api/java/JavaStreamingContext.scala | 32 ++--- .../streaming/api/python/PythonDStream.scala | 5 +- .../spark/streaming/receiver/Receiver.scala | 6 +- .../streaming/scheduler/JobScheduler.scala | 4 +- .../scheduler/ReceivedBlockTracker.scala | 4 +- .../util/FileBasedWriteAheadLog.scala | 4 +- .../spark/streaming/JavaTestUtils.scala | 24 ++-- .../streaming/util/WriteAheadLogSuite.scala | 4 +- .../spark/tools/GenerateMIMAIgnore.scala | 6 +- .../org/apache/spark/deploy/yarn/Client.scala | 13 ++- .../spark/deploy/yarn/ExecutorRunnable.scala | 24 ++-- .../spark/deploy/yarn/YarnAllocator.scala | 19 ++- .../spark/deploy/yarn/YarnRMClient.scala | 8 +- .../deploy/yarn/BaseYarnClusterSuite.scala | 6 +- .../spark/deploy/yarn/ClientSuite.scala | 8 +- .../spark/deploy/yarn/YarnClusterSuite.scala | 5 +- 171 files changed, 863 insertions(+), 880 deletions(-) diff --git a/core/src/main/java/org/apache/spark/shuffle/unsafe/UnsafeShuffleWriter.java b/core/src/main/java/org/apache/spark/shuffle/unsafe/UnsafeShuffleWriter.java index 2389c28b28395..fdb309e365f69 100644 --- a/core/src/main/java/org/apache/spark/shuffle/unsafe/UnsafeShuffleWriter.java +++ b/core/src/main/java/org/apache/spark/shuffle/unsafe/UnsafeShuffleWriter.java @@ -24,7 +24,7 @@ import scala.Option; import scala.Product2; -import scala.collection.JavaConversions; +import scala.collection.JavaConverters; import scala.collection.immutable.Map; import scala.reflect.ClassTag; import scala.reflect.ClassTag$; @@ -160,7 +160,7 @@ public long getPeakMemoryUsedBytes() { */ @VisibleForTesting public void write(Iterator> records) throws IOException { - write(JavaConversions.asScalaIterator(records)); + write(JavaConverters.asScalaIteratorConverter(records).asScala()); } @Override diff --git a/core/src/main/scala/org/apache/spark/MapOutputTracker.scala b/core/src/main/scala/org/apache/spark/MapOutputTracker.scala index 92218832d256f..a387592783850 100644 --- a/core/src/main/scala/org/apache/spark/MapOutputTracker.scala +++ b/core/src/main/scala/org/apache/spark/MapOutputTracker.scala @@ -21,8 +21,8 @@ import java.io._ import java.util.concurrent.ConcurrentHashMap import java.util.zip.{GZIPInputStream, GZIPOutputStream} +import scala.collection.JavaConverters._ import scala.collection.mutable.{ArrayBuffer, HashMap, HashSet, Map} -import scala.collection.JavaConversions._ import scala.reflect.ClassTag import org.apache.spark.rpc.{RpcEndpointRef, RpcEnv, RpcCallContext, RpcEndpoint} @@ -398,7 +398,7 @@ private[spark] class MapOutputTrackerMaster(conf: SparkConf) */ private[spark] class MapOutputTrackerWorker(conf: SparkConf) extends MapOutputTracker(conf) { protected val mapStatuses: Map[Int, Array[MapStatus]] = - new ConcurrentHashMap[Int, Array[MapStatus]] + new ConcurrentHashMap[Int, 
Array[MapStatus]]().asScala } private[spark] object MapOutputTracker extends Logging { diff --git a/core/src/main/scala/org/apache/spark/SSLOptions.scala b/core/src/main/scala/org/apache/spark/SSLOptions.scala index 32df42d57dbd6..3b9c885bf97a7 100644 --- a/core/src/main/scala/org/apache/spark/SSLOptions.scala +++ b/core/src/main/scala/org/apache/spark/SSLOptions.scala @@ -17,9 +17,11 @@ package org.apache.spark -import java.io.{File, FileInputStream} -import java.security.{KeyStore, NoSuchAlgorithmException} -import javax.net.ssl.{KeyManager, KeyManagerFactory, SSLContext, TrustManager, TrustManagerFactory} +import java.io.File +import java.security.NoSuchAlgorithmException +import javax.net.ssl.SSLContext + +import scala.collection.JavaConverters._ import com.typesafe.config.{Config, ConfigFactory, ConfigValueFactory} import org.eclipse.jetty.util.ssl.SslContextFactory @@ -79,7 +81,6 @@ private[spark] case class SSLOptions( * object. It can be used then to compose the ultimate Akka configuration. */ def createAkkaConfig: Option[Config] = { - import scala.collection.JavaConversions._ if (enabled) { Some(ConfigFactory.empty() .withValue("akka.remote.netty.tcp.security.key-store", @@ -97,7 +98,7 @@ private[spark] case class SSLOptions( .withValue("akka.remote.netty.tcp.security.protocol", ConfigValueFactory.fromAnyRef(protocol.getOrElse(""))) .withValue("akka.remote.netty.tcp.security.enabled-algorithms", - ConfigValueFactory.fromIterable(supportedAlgorithms.toSeq)) + ConfigValueFactory.fromIterable(supportedAlgorithms.asJava)) .withValue("akka.remote.netty.tcp.enable-ssl", ConfigValueFactory.fromAnyRef(true))) } else { diff --git a/core/src/main/scala/org/apache/spark/SparkContext.scala b/core/src/main/scala/org/apache/spark/SparkContext.scala index 9849aff85d72e..f3da04a7f55d0 100644 --- a/core/src/main/scala/org/apache/spark/SparkContext.scala +++ b/core/src/main/scala/org/apache/spark/SparkContext.scala @@ -26,8 +26,8 @@ import java.util.{Arrays, Properties, UUID} import java.util.concurrent.atomic.{AtomicReference, AtomicBoolean, AtomicInteger} import java.util.UUID.randomUUID +import scala.collection.JavaConverters._ import scala.collection.{Map, Set} -import scala.collection.JavaConversions._ import scala.collection.generic.Growable import scala.collection.mutable.HashMap import scala.reflect.{ClassTag, classTag} @@ -1546,7 +1546,7 @@ class SparkContext(config: SparkConf) extends Logging with ExecutorAllocationCli def getAllPools: Seq[Schedulable] = { assertNotStopped() // TODO(xiajunluan): We should take nested pools into account - taskScheduler.rootPool.schedulableQueue.toSeq + taskScheduler.rootPool.schedulableQueue.asScala.toSeq } /** diff --git a/core/src/main/scala/org/apache/spark/TestUtils.scala b/core/src/main/scala/org/apache/spark/TestUtils.scala index a1ebbecf93b7b..888763a3e8ebf 100644 --- a/core/src/main/scala/org/apache/spark/TestUtils.scala +++ b/core/src/main/scala/org/apache/spark/TestUtils.scala @@ -19,11 +19,12 @@ package org.apache.spark import java.io.{ByteArrayInputStream, File, FileInputStream, FileOutputStream} import java.net.{URI, URL} +import java.nio.charset.StandardCharsets +import java.util.Arrays import java.util.jar.{JarEntry, JarOutputStream} -import scala.collection.JavaConversions._ +import scala.collection.JavaConverters._ -import com.google.common.base.Charsets.UTF_8 import com.google.common.io.{ByteStreams, Files} import javax.tools.{JavaFileObject, SimpleJavaFileObject, ToolProvider} @@ -71,7 +72,7 @@ private[spark] object TestUtils { 
files.foreach { case (k, v) => val entry = new JarEntry(k) jarStream.putNextEntry(entry) - ByteStreams.copy(new ByteArrayInputStream(v.getBytes(UTF_8)), jarStream) + ByteStreams.copy(new ByteArrayInputStream(v.getBytes(StandardCharsets.UTF_8)), jarStream) } jarStream.close() jarFile.toURI.toURL @@ -125,7 +126,7 @@ private[spark] object TestUtils { } else { Seq() } - compiler.getTask(null, null, null, options, null, Seq(sourceFile)).call() + compiler.getTask(null, null, null, options.asJava, null, Arrays.asList(sourceFile)).call() val fileName = className + ".class" val result = new File(fileName) diff --git a/core/src/main/scala/org/apache/spark/api/java/JavaHadoopRDD.scala b/core/src/main/scala/org/apache/spark/api/java/JavaHadoopRDD.scala index 0ae0b4ec042e2..891bcddeac286 100644 --- a/core/src/main/scala/org/apache/spark/api/java/JavaHadoopRDD.scala +++ b/core/src/main/scala/org/apache/spark/api/java/JavaHadoopRDD.scala @@ -17,7 +17,7 @@ package org.apache.spark.api.java -import scala.collection.JavaConversions._ +import scala.collection.JavaConverters._ import scala.reflect.ClassTag import org.apache.hadoop.mapred.InputSplit @@ -37,7 +37,7 @@ class JavaHadoopRDD[K, V](rdd: HadoopRDD[K, V]) def mapPartitionsWithInputSplit[R]( f: JFunction2[InputSplit, java.util.Iterator[(K, V)], java.util.Iterator[R]], preservesPartitioning: Boolean = false): JavaRDD[R] = { - new JavaRDD(rdd.mapPartitionsWithInputSplit((a, b) => f.call(a, asJavaIterator(b)), + new JavaRDD(rdd.mapPartitionsWithInputSplit((a, b) => f.call(a, b.asJava).asScala, preservesPartitioning)(fakeClassTag))(fakeClassTag) } } diff --git a/core/src/main/scala/org/apache/spark/api/java/JavaNewHadoopRDD.scala b/core/src/main/scala/org/apache/spark/api/java/JavaNewHadoopRDD.scala index ec4f3964d75e0..0f49279f3e647 100644 --- a/core/src/main/scala/org/apache/spark/api/java/JavaNewHadoopRDD.scala +++ b/core/src/main/scala/org/apache/spark/api/java/JavaNewHadoopRDD.scala @@ -17,7 +17,7 @@ package org.apache.spark.api.java -import scala.collection.JavaConversions._ +import scala.collection.JavaConverters._ import scala.reflect.ClassTag import org.apache.hadoop.mapreduce.InputSplit @@ -37,7 +37,7 @@ class JavaNewHadoopRDD[K, V](rdd: NewHadoopRDD[K, V]) def mapPartitionsWithInputSplit[R]( f: JFunction2[InputSplit, java.util.Iterator[(K, V)], java.util.Iterator[R]], preservesPartitioning: Boolean = false): JavaRDD[R] = { - new JavaRDD(rdd.mapPartitionsWithInputSplit((a, b) => f.call(a, asJavaIterator(b)), + new JavaRDD(rdd.mapPartitionsWithInputSplit((a, b) => f.call(a, b.asJava).asScala, preservesPartitioning)(fakeClassTag))(fakeClassTag) } } diff --git a/core/src/main/scala/org/apache/spark/api/java/JavaPairRDD.scala b/core/src/main/scala/org/apache/spark/api/java/JavaPairRDD.scala index 8441bb3a3047e..fb787979c1820 100644 --- a/core/src/main/scala/org/apache/spark/api/java/JavaPairRDD.scala +++ b/core/src/main/scala/org/apache/spark/api/java/JavaPairRDD.scala @@ -20,7 +20,7 @@ package org.apache.spark.api.java import java.util.{Comparator, List => JList, Map => JMap} import java.lang.{Iterable => JIterable} -import scala.collection.JavaConversions._ +import scala.collection.JavaConverters._ import scala.language.implicitConversions import scala.reflect.ClassTag @@ -142,7 +142,7 @@ class JavaPairRDD[K, V](val rdd: RDD[(K, V)]) def sampleByKey(withReplacement: Boolean, fractions: JMap[K, Double], seed: Long): JavaPairRDD[K, V] = - new JavaPairRDD[K, V](rdd.sampleByKey(withReplacement, fractions, seed)) + new JavaPairRDD[K, 
V](rdd.sampleByKey(withReplacement, fractions.asScala, seed)) /** * Return a subset of this RDD sampled by key (via stratified sampling). @@ -173,7 +173,7 @@ class JavaPairRDD[K, V](val rdd: RDD[(K, V)]) def sampleByKeyExact(withReplacement: Boolean, fractions: JMap[K, Double], seed: Long): JavaPairRDD[K, V] = - new JavaPairRDD[K, V](rdd.sampleByKeyExact(withReplacement, fractions, seed)) + new JavaPairRDD[K, V](rdd.sampleByKeyExact(withReplacement, fractions.asScala, seed)) /** * ::Experimental:: @@ -768,7 +768,7 @@ class JavaPairRDD[K, V](val rdd: RDD[(K, V)]) * Return the list of values in the RDD for key `key`. This operation is done efficiently if the * RDD has a known partitioner by only searching the partition that the key maps to. */ - def lookup(key: K): JList[V] = seqAsJavaList(rdd.lookup(key)) + def lookup(key: K): JList[V] = rdd.lookup(key).asJava /** Output the RDD to any Hadoop-supported file system. */ def saveAsHadoopFile[F <: OutputFormat[_, _]]( @@ -987,30 +987,27 @@ class JavaPairRDD[K, V](val rdd: RDD[(K, V)]) object JavaPairRDD { private[spark] def groupByResultToJava[K: ClassTag, T](rdd: RDD[(K, Iterable[T])]): RDD[(K, JIterable[T])] = { - rddToPairRDDFunctions(rdd).mapValues(asJavaIterable) + rddToPairRDDFunctions(rdd).mapValues(_.asJava) } private[spark] def cogroupResultToJava[K: ClassTag, V, W]( rdd: RDD[(K, (Iterable[V], Iterable[W]))]): RDD[(K, (JIterable[V], JIterable[W]))] = { - rddToPairRDDFunctions(rdd).mapValues(x => (asJavaIterable(x._1), asJavaIterable(x._2))) + rddToPairRDDFunctions(rdd).mapValues(x => (x._1.asJava, x._2.asJava)) } private[spark] def cogroupResult2ToJava[K: ClassTag, V, W1, W2]( rdd: RDD[(K, (Iterable[V], Iterable[W1], Iterable[W2]))]) : RDD[(K, (JIterable[V], JIterable[W1], JIterable[W2]))] = { - rddToPairRDDFunctions(rdd) - .mapValues(x => (asJavaIterable(x._1), asJavaIterable(x._2), asJavaIterable(x._3))) + rddToPairRDDFunctions(rdd).mapValues(x => (x._1.asJava, x._2.asJava, x._3.asJava)) } private[spark] def cogroupResult3ToJava[K: ClassTag, V, W1, W2, W3]( rdd: RDD[(K, (Iterable[V], Iterable[W1], Iterable[W2], Iterable[W3]))]) : RDD[(K, (JIterable[V], JIterable[W1], JIterable[W2], JIterable[W3]))] = { - rddToPairRDDFunctions(rdd) - .mapValues(x => - (asJavaIterable(x._1), asJavaIterable(x._2), asJavaIterable(x._3), asJavaIterable(x._4))) + rddToPairRDDFunctions(rdd).mapValues(x => (x._1.asJava, x._2.asJava, x._3.asJava, x._4.asJava)) } def fromRDD[K: ClassTag, V: ClassTag](rdd: RDD[(K, V)]): JavaPairRDD[K, V] = { diff --git a/core/src/main/scala/org/apache/spark/api/java/JavaRDDLike.scala b/core/src/main/scala/org/apache/spark/api/java/JavaRDDLike.scala index c582488f16fe7..fc817cdd6a3f8 100644 --- a/core/src/main/scala/org/apache/spark/api/java/JavaRDDLike.scala +++ b/core/src/main/scala/org/apache/spark/api/java/JavaRDDLike.scala @@ -21,7 +21,6 @@ import java.{lang => jl} import java.lang.{Iterable => JIterable, Long => JLong} import java.util.{Comparator, List => JList, Iterator => JIterator} -import scala.collection.JavaConversions._ import scala.collection.JavaConverters._ import scala.reflect.ClassTag @@ -59,10 +58,10 @@ trait JavaRDDLike[T, This <: JavaRDDLike[T, This]] extends Serializable { def rdd: RDD[T] @deprecated("Use partitions() instead.", "1.1.0") - def splits: JList[Partition] = new java.util.ArrayList(rdd.partitions.toSeq) + def splits: JList[Partition] = rdd.partitions.toSeq.asJava /** Set of partitions in this RDD. 
*/ - def partitions: JList[Partition] = new java.util.ArrayList(rdd.partitions.toSeq) + def partitions: JList[Partition] = rdd.partitions.toSeq.asJava /** The partitioner of this RDD. */ def partitioner: Optional[Partitioner] = JavaUtils.optionToOptional(rdd.partitioner) @@ -82,7 +81,7 @@ trait JavaRDDLike[T, This <: JavaRDDLike[T, This]] extends Serializable { * subclasses of RDD. */ def iterator(split: Partition, taskContext: TaskContext): java.util.Iterator[T] = - asJavaIterator(rdd.iterator(split, taskContext)) + rdd.iterator(split, taskContext).asJava // Transformations (return a new RDD) @@ -99,7 +98,7 @@ trait JavaRDDLike[T, This <: JavaRDDLike[T, This]] extends Serializable { def mapPartitionsWithIndex[R]( f: JFunction2[jl.Integer, java.util.Iterator[T], java.util.Iterator[R]], preservesPartitioning: Boolean = false): JavaRDD[R] = - new JavaRDD(rdd.mapPartitionsWithIndex(((a, b) => f(a, asJavaIterator(b))), + new JavaRDD(rdd.mapPartitionsWithIndex((a, b) => f.call(a, b.asJava).asScala, preservesPartitioning)(fakeClassTag))(fakeClassTag) /** @@ -153,7 +152,7 @@ trait JavaRDDLike[T, This <: JavaRDDLike[T, This]] extends Serializable { */ def mapPartitions[U](f: FlatMapFunction[java.util.Iterator[T], U]): JavaRDD[U] = { def fn: (Iterator[T]) => Iterator[U] = { - (x: Iterator[T]) => asScalaIterator(f.call(asJavaIterator(x)).iterator()) + (x: Iterator[T]) => f.call(x.asJava).iterator().asScala } JavaRDD.fromRDD(rdd.mapPartitions(fn)(fakeClassTag[U]))(fakeClassTag[U]) } @@ -164,7 +163,7 @@ trait JavaRDDLike[T, This <: JavaRDDLike[T, This]] extends Serializable { def mapPartitions[U](f: FlatMapFunction[java.util.Iterator[T], U], preservesPartitioning: Boolean): JavaRDD[U] = { def fn: (Iterator[T]) => Iterator[U] = { - (x: Iterator[T]) => asScalaIterator(f.call(asJavaIterator(x)).iterator()) + (x: Iterator[T]) => f.call(x.asJava).iterator().asScala } JavaRDD.fromRDD( rdd.mapPartitions(fn, preservesPartitioning)(fakeClassTag[U]))(fakeClassTag[U]) @@ -175,7 +174,7 @@ trait JavaRDDLike[T, This <: JavaRDDLike[T, This]] extends Serializable { */ def mapPartitionsToDouble(f: DoubleFlatMapFunction[java.util.Iterator[T]]): JavaDoubleRDD = { def fn: (Iterator[T]) => Iterator[jl.Double] = { - (x: Iterator[T]) => asScalaIterator(f.call(asJavaIterator(x)).iterator()) + (x: Iterator[T]) => f.call(x.asJava).iterator().asScala } new JavaDoubleRDD(rdd.mapPartitions(fn).map((x: jl.Double) => x.doubleValue())) } @@ -186,7 +185,7 @@ trait JavaRDDLike[T, This <: JavaRDDLike[T, This]] extends Serializable { def mapPartitionsToPair[K2, V2](f: PairFlatMapFunction[java.util.Iterator[T], K2, V2]): JavaPairRDD[K2, V2] = { def fn: (Iterator[T]) => Iterator[(K2, V2)] = { - (x: Iterator[T]) => asScalaIterator(f.call(asJavaIterator(x)).iterator()) + (x: Iterator[T]) => f.call(x.asJava).iterator().asScala } JavaPairRDD.fromRDD(rdd.mapPartitions(fn))(fakeClassTag[K2], fakeClassTag[V2]) } @@ -197,7 +196,7 @@ trait JavaRDDLike[T, This <: JavaRDDLike[T, This]] extends Serializable { def mapPartitionsToDouble(f: DoubleFlatMapFunction[java.util.Iterator[T]], preservesPartitioning: Boolean): JavaDoubleRDD = { def fn: (Iterator[T]) => Iterator[jl.Double] = { - (x: Iterator[T]) => asScalaIterator(f.call(asJavaIterator(x)).iterator()) + (x: Iterator[T]) => f.call(x.asJava).iterator().asScala } new JavaDoubleRDD(rdd.mapPartitions(fn, preservesPartitioning) .map(x => x.doubleValue())) @@ -209,7 +208,7 @@ trait JavaRDDLike[T, This <: JavaRDDLike[T, This]] extends Serializable { def mapPartitionsToPair[K2, V2](f: 
PairFlatMapFunction[java.util.Iterator[T], K2, V2], preservesPartitioning: Boolean): JavaPairRDD[K2, V2] = { def fn: (Iterator[T]) => Iterator[(K2, V2)] = { - (x: Iterator[T]) => asScalaIterator(f.call(asJavaIterator(x)).iterator()) + (x: Iterator[T]) => f.call(x.asJava).iterator().asScala } JavaPairRDD.fromRDD( rdd.mapPartitions(fn, preservesPartitioning))(fakeClassTag[K2], fakeClassTag[V2]) @@ -219,14 +218,14 @@ trait JavaRDDLike[T, This <: JavaRDDLike[T, This]] extends Serializable { * Applies a function f to each partition of this RDD. */ def foreachPartition(f: VoidFunction[java.util.Iterator[T]]) { - rdd.foreachPartition((x => f.call(asJavaIterator(x)))) + rdd.foreachPartition((x => f.call(x.asJava))) } /** * Return an RDD created by coalescing all elements within each partition into an array. */ def glom(): JavaRDD[JList[T]] = - new JavaRDD(rdd.glom().map(x => new java.util.ArrayList[T](x.toSeq))) + new JavaRDD(rdd.glom().map(_.toSeq.asJava)) /** * Return the Cartesian product of this RDD and another one, that is, the RDD of all pairs of @@ -266,13 +265,13 @@ trait JavaRDDLike[T, This <: JavaRDDLike[T, This]] extends Serializable { * Return an RDD created by piping elements to a forked external process. */ def pipe(command: JList[String]): JavaRDD[String] = - rdd.pipe(asScalaBuffer(command)) + rdd.pipe(command.asScala) /** * Return an RDD created by piping elements to a forked external process. */ def pipe(command: JList[String], env: java.util.Map[String, String]): JavaRDD[String] = - rdd.pipe(asScalaBuffer(command), mapAsScalaMap(env)) + rdd.pipe(command.asScala, env.asScala) /** * Zips this RDD with another one, returning key-value pairs with the first element in each RDD, @@ -294,8 +293,7 @@ trait JavaRDDLike[T, This <: JavaRDDLike[T, This]] extends Serializable { other: JavaRDDLike[U, _], f: FlatMapFunction2[java.util.Iterator[T], java.util.Iterator[U], V]): JavaRDD[V] = { def fn: (Iterator[T], Iterator[U]) => Iterator[V] = { - (x: Iterator[T], y: Iterator[U]) => asScalaIterator( - f.call(asJavaIterator(x), asJavaIterator(y)).iterator()) + (x: Iterator[T], y: Iterator[U]) => f.call(x.asJava, y.asJava).iterator().asScala } JavaRDD.fromRDD( rdd.zipPartitions(other.rdd)(fn)(other.classTag, fakeClassTag[V]))(fakeClassTag[V]) @@ -333,22 +331,16 @@ trait JavaRDDLike[T, This <: JavaRDDLike[T, This]] extends Serializable { /** * Return an array that contains all of the elements in this RDD. */ - def collect(): JList[T] = { - import scala.collection.JavaConversions._ - val arr: java.util.Collection[T] = rdd.collect().toSeq - new java.util.ArrayList(arr) - } + def collect(): JList[T] = + rdd.collect().toSeq.asJava /** * Return an iterator that contains all of the elements in this RDD. * * The iterator will consume as much memory as the largest partition in this RDD. */ - def toLocalIterator(): JIterator[T] = { - import scala.collection.JavaConversions._ - rdd.toLocalIterator - } - + def toLocalIterator(): JIterator[T] = + asJavaIteratorConverter(rdd.toLocalIterator).asJava /** * Return an array that contains all of the elements in this RDD. @@ -363,9 +355,8 @@ trait JavaRDDLike[T, This <: JavaRDDLike[T, This]] extends Serializable { def collectPartitions(partitionIds: Array[Int]): Array[JList[T]] = { // This is useful for implementing `take` from other language frontends // like Python where the data is serialized. 
- import scala.collection.JavaConversions._ val res = context.runJob(rdd, (it: Iterator[T]) => it.toArray, partitionIds) - res.map(x => new java.util.ArrayList(x.toSeq)).toArray + res.map(_.toSeq.asJava) } /** @@ -489,20 +480,14 @@ trait JavaRDDLike[T, This <: JavaRDDLike[T, This]] extends Serializable { * it will be slow if a lot of partitions are required. In that case, use collect() to get the * whole RDD instead. */ - def take(num: Int): JList[T] = { - import scala.collection.JavaConversions._ - val arr: java.util.Collection[T] = rdd.take(num).toSeq - new java.util.ArrayList(arr) - } + def take(num: Int): JList[T] = + rdd.take(num).toSeq.asJava def takeSample(withReplacement: Boolean, num: Int): JList[T] = takeSample(withReplacement, num, Utils.random.nextLong) - def takeSample(withReplacement: Boolean, num: Int, seed: Long): JList[T] = { - import scala.collection.JavaConversions._ - val arr: java.util.Collection[T] = rdd.takeSample(withReplacement, num, seed).toSeq - new java.util.ArrayList(arr) - } + def takeSample(withReplacement: Boolean, num: Int, seed: Long): JList[T] = + rdd.takeSample(withReplacement, num, seed).toSeq.asJava /** * Return the first element in this RDD. @@ -582,10 +567,7 @@ trait JavaRDDLike[T, This <: JavaRDDLike[T, This]] extends Serializable { * @return an array of top elements */ def top(num: Int, comp: Comparator[T]): JList[T] = { - import scala.collection.JavaConversions._ - val topElems = rdd.top(num)(Ordering.comparatorToOrdering(comp)) - val arr: java.util.Collection[T] = topElems.toSeq - new java.util.ArrayList(arr) + rdd.top(num)(Ordering.comparatorToOrdering(comp)).toSeq.asJava } /** @@ -607,10 +589,7 @@ trait JavaRDDLike[T, This <: JavaRDDLike[T, This]] extends Serializable { * @return an array of top elements */ def takeOrdered(num: Int, comp: Comparator[T]): JList[T] = { - import scala.collection.JavaConversions._ - val topElems = rdd.takeOrdered(num)(Ordering.comparatorToOrdering(comp)) - val arr: java.util.Collection[T] = topElems.toSeq - new java.util.ArrayList(arr) + rdd.takeOrdered(num)(Ordering.comparatorToOrdering(comp)).toSeq.asJava } /** @@ -696,7 +675,7 @@ trait JavaRDDLike[T, This <: JavaRDDLike[T, This]] extends Serializable { * applies a function f to each partition of this RDD. 
*/ def foreachPartitionAsync(f: VoidFunction[java.util.Iterator[T]]): JavaFutureAction[Void] = { - new JavaFutureActionWrapper[Unit, Void](rdd.foreachPartitionAsync(x => f.call(x)), + new JavaFutureActionWrapper[Unit, Void](rdd.foreachPartitionAsync(x => f.call(x.asJava)), { x => null.asInstanceOf[Void] }) } } diff --git a/core/src/main/scala/org/apache/spark/api/java/JavaSparkContext.scala b/core/src/main/scala/org/apache/spark/api/java/JavaSparkContext.scala index 02e49a853c5f7..609496ccdfef1 100644 --- a/core/src/main/scala/org/apache/spark/api/java/JavaSparkContext.scala +++ b/core/src/main/scala/org/apache/spark/api/java/JavaSparkContext.scala @@ -21,8 +21,7 @@ import java.io.Closeable import java.util import java.util.{Map => JMap} -import scala.collection.JavaConversions -import scala.collection.JavaConversions._ +import scala.collection.JavaConverters._ import scala.language.implicitConversions import scala.reflect.ClassTag @@ -104,7 +103,7 @@ class JavaSparkContext(val sc: SparkContext) */ def this(master: String, appName: String, sparkHome: String, jars: Array[String], environment: JMap[String, String]) = - this(new SparkContext(master, appName, sparkHome, jars.toSeq, environment, Map())) + this(new SparkContext(master, appName, sparkHome, jars.toSeq, environment.asScala, Map())) private[spark] val env = sc.env @@ -118,7 +117,7 @@ class JavaSparkContext(val sc: SparkContext) def appName: String = sc.appName - def jars: util.List[String] = sc.jars + def jars: util.List[String] = sc.jars.asJava def startTime: java.lang.Long = sc.startTime @@ -142,7 +141,7 @@ class JavaSparkContext(val sc: SparkContext) /** Distribute a local Scala collection to form an RDD. */ def parallelize[T](list: java.util.List[T], numSlices: Int): JavaRDD[T] = { implicit val ctag: ClassTag[T] = fakeClassTag - sc.parallelize(JavaConversions.asScalaBuffer(list), numSlices) + sc.parallelize(list.asScala, numSlices) } /** Get an RDD that has no partitions or elements. */ @@ -161,7 +160,7 @@ class JavaSparkContext(val sc: SparkContext) : JavaPairRDD[K, V] = { implicit val ctagK: ClassTag[K] = fakeClassTag implicit val ctagV: ClassTag[V] = fakeClassTag - JavaPairRDD.fromRDD(sc.parallelize(JavaConversions.asScalaBuffer(list), numSlices)) + JavaPairRDD.fromRDD(sc.parallelize(list.asScala, numSlices)) } /** Distribute a local Scala collection to form an RDD. */ @@ -170,8 +169,7 @@ class JavaSparkContext(val sc: SparkContext) /** Distribute a local Scala collection to form an RDD. */ def parallelizeDoubles(list: java.util.List[java.lang.Double], numSlices: Int): JavaDoubleRDD = - JavaDoubleRDD.fromRDD(sc.parallelize(JavaConversions.asScalaBuffer(list).map(_.doubleValue()), - numSlices)) + JavaDoubleRDD.fromRDD(sc.parallelize(list.asScala.map(_.doubleValue()), numSlices)) /** Distribute a local Scala collection to form an RDD. */ def parallelizeDoubles(list: java.util.List[java.lang.Double]): JavaDoubleRDD = @@ -519,7 +517,7 @@ class JavaSparkContext(val sc: SparkContext) /** Build the union of two or more RDDs. */ override def union[T](first: JavaRDD[T], rest: java.util.List[JavaRDD[T]]): JavaRDD[T] = { - val rdds: Seq[RDD[T]] = (Seq(first) ++ asScalaBuffer(rest)).map(_.rdd) + val rdds: Seq[RDD[T]] = (Seq(first) ++ rest.asScala).map(_.rdd) implicit val ctag: ClassTag[T] = first.classTag sc.union(rdds) } @@ -527,7 +525,7 @@ class JavaSparkContext(val sc: SparkContext) /** Build the union of two or more RDDs. 
*/ override def union[K, V](first: JavaPairRDD[K, V], rest: java.util.List[JavaPairRDD[K, V]]) : JavaPairRDD[K, V] = { - val rdds: Seq[RDD[(K, V)]] = (Seq(first) ++ asScalaBuffer(rest)).map(_.rdd) + val rdds: Seq[RDD[(K, V)]] = (Seq(first) ++ rest.asScala).map(_.rdd) implicit val ctag: ClassTag[(K, V)] = first.classTag implicit val ctagK: ClassTag[K] = first.kClassTag implicit val ctagV: ClassTag[V] = first.vClassTag @@ -536,7 +534,7 @@ class JavaSparkContext(val sc: SparkContext) /** Build the union of two or more RDDs. */ override def union(first: JavaDoubleRDD, rest: java.util.List[JavaDoubleRDD]): JavaDoubleRDD = { - val rdds: Seq[RDD[Double]] = (Seq(first) ++ asScalaBuffer(rest)).map(_.srdd) + val rdds: Seq[RDD[Double]] = (Seq(first) ++ rest.asScala).map(_.srdd) new JavaDoubleRDD(sc.union(rdds)) } diff --git a/core/src/main/scala/org/apache/spark/api/python/PythonHadoopUtil.scala b/core/src/main/scala/org/apache/spark/api/python/PythonHadoopUtil.scala index b959b683d1674..a7dfa1d257cf2 100644 --- a/core/src/main/scala/org/apache/spark/api/python/PythonHadoopUtil.scala +++ b/core/src/main/scala/org/apache/spark/api/python/PythonHadoopUtil.scala @@ -17,15 +17,17 @@ package org.apache.spark.api.python -import org.apache.spark.broadcast.Broadcast -import org.apache.spark.rdd.RDD -import org.apache.spark.util.{SerializableConfiguration, Utils} -import org.apache.spark.{Logging, SparkException} +import scala.collection.JavaConverters._ +import scala.util.{Failure, Success, Try} + import org.apache.hadoop.conf.Configuration import org.apache.hadoop.io._ -import scala.util.{Failure, Success, Try} -import org.apache.spark.annotation.Experimental +import org.apache.spark.{Logging, SparkException} +import org.apache.spark.annotation.Experimental +import org.apache.spark.broadcast.Broadcast +import org.apache.spark.rdd.RDD +import org.apache.spark.util.{SerializableConfiguration, Utils} /** * :: Experimental :: @@ -68,7 +70,6 @@ private[python] class WritableToJavaConverter( * object representation */ private def convertWritable(writable: Writable): Any = { - import collection.JavaConversions._ writable match { case iw: IntWritable => iw.get() case dw: DoubleWritable => dw.get() @@ -89,9 +90,7 @@ private[python] class WritableToJavaConverter( aw.get().map(convertWritable(_)) case mw: MapWritable => val map = new java.util.HashMap[Any, Any]() - mw.foreach { case (k, v) => - map.put(convertWritable(k), convertWritable(v)) - } + mw.asScala.foreach { case (k, v) => map.put(convertWritable(k), convertWritable(v)) } map case w: Writable => WritableUtils.clone(w, conf.value.value) case other => other @@ -122,7 +121,6 @@ private[python] class JavaToWritableConverter extends Converter[Any, Writable] { * supported out-of-the-box. 
*/ private def convertToWritable(obj: Any): Writable = { - import collection.JavaConversions._ obj match { case i: java.lang.Integer => new IntWritable(i) case d: java.lang.Double => new DoubleWritable(d) @@ -134,7 +132,7 @@ private[python] class JavaToWritableConverter extends Converter[Any, Writable] { case null => NullWritable.get() case map: java.util.Map[_, _] => val mapWritable = new MapWritable() - map.foreach { case (k, v) => + map.asScala.foreach { case (k, v) => mapWritable.put(convertToWritable(k), convertToWritable(v)) } mapWritable @@ -161,9 +159,8 @@ private[python] object PythonHadoopUtil { * Convert a [[java.util.Map]] of properties to a [[org.apache.hadoop.conf.Configuration]] */ def mapToConf(map: java.util.Map[String, String]): Configuration = { - import collection.JavaConversions._ val conf = new Configuration() - map.foreach{ case (k, v) => conf.set(k, v) } + map.asScala.foreach { case (k, v) => conf.set(k, v) } conf } @@ -172,9 +169,8 @@ private[python] object PythonHadoopUtil { * any matching keys in left */ def mergeConfs(left: Configuration, right: Configuration): Configuration = { - import collection.JavaConversions._ val copy = new Configuration(left) - right.iterator().foreach(entry => copy.set(entry.getKey, entry.getValue)) + right.asScala.foreach(entry => copy.set(entry.getKey, entry.getValue)) copy } diff --git a/core/src/main/scala/org/apache/spark/api/python/PythonRDD.scala b/core/src/main/scala/org/apache/spark/api/python/PythonRDD.scala index 2a56bf28d7027..b4d152b336602 100644 --- a/core/src/main/scala/org/apache/spark/api/python/PythonRDD.scala +++ b/core/src/main/scala/org/apache/spark/api/python/PythonRDD.scala @@ -21,7 +21,7 @@ import java.io._ import java.net._ import java.util.{Collections, ArrayList => JArrayList, List => JList, Map => JMap} -import scala.collection.JavaConversions._ +import scala.collection.JavaConverters._ import scala.collection.mutable import scala.language.existentials @@ -66,11 +66,11 @@ private[spark] class PythonRDD( val env = SparkEnv.get val localdir = env.blockManager.diskBlockManager.localDirs.map( f => f.getPath()).mkString(",") - envVars += ("SPARK_LOCAL_DIRS" -> localdir) // it's also used in monitor thread + envVars.put("SPARK_LOCAL_DIRS", localdir) // it's also used in monitor thread if (reuse_worker) { - envVars += ("SPARK_REUSE_WORKER" -> "1") + envVars.put("SPARK_REUSE_WORKER", "1") } - val worker: Socket = env.createPythonWorker(pythonExec, envVars.toMap) + val worker: Socket = env.createPythonWorker(pythonExec, envVars.asScala.toMap) // Whether is the worker released into idle pool @volatile var released = false @@ -150,7 +150,7 @@ private[spark] class PythonRDD( // Check whether the worker is ready to be re-used. 
if (stream.readInt() == SpecialLengths.END_OF_STREAM) { if (reuse_worker) { - env.releasePythonWorker(pythonExec, envVars.toMap, worker) + env.releasePythonWorker(pythonExec, envVars.asScala.toMap, worker) released = true } } @@ -217,13 +217,13 @@ private[spark] class PythonRDD( // sparkFilesDir PythonRDD.writeUTF(SparkFiles.getRootDirectory, dataOut) // Python includes (*.zip and *.egg files) - dataOut.writeInt(pythonIncludes.length) - for (include <- pythonIncludes) { + dataOut.writeInt(pythonIncludes.size()) + for (include <- pythonIncludes.asScala) { PythonRDD.writeUTF(include, dataOut) } // Broadcast variables val oldBids = PythonRDD.getWorkerBroadcasts(worker) - val newBids = broadcastVars.map(_.id).toSet + val newBids = broadcastVars.asScala.map(_.id).toSet // number of different broadcasts val toRemove = oldBids.diff(newBids) val cnt = toRemove.size + newBids.diff(oldBids).size @@ -233,7 +233,7 @@ private[spark] class PythonRDD( dataOut.writeLong(- bid - 1) // bid >= 0 oldBids.remove(bid) } - for (broadcast <- broadcastVars) { + for (broadcast <- broadcastVars.asScala) { if (!oldBids.contains(broadcast.id)) { // send new broadcast dataOut.writeLong(broadcast.id) @@ -287,7 +287,7 @@ private[spark] class PythonRDD( if (!context.isCompleted) { try { logWarning("Incomplete task interrupted: Attempting to kill Python Worker") - env.destroyPythonWorker(pythonExec, envVars.toMap, worker) + env.destroyPythonWorker(pythonExec, envVars.asScala.toMap, worker) } catch { case e: Exception => logError("Exception when trying to kill worker", e) @@ -358,10 +358,10 @@ private[spark] object PythonRDD extends Logging { type ByteArray = Array[Byte] type UnrolledPartition = Array[ByteArray] val allPartitions: Array[UnrolledPartition] = - sc.runJob(rdd, (x: Iterator[ByteArray]) => x.toArray, partitions) + sc.runJob(rdd, (x: Iterator[ByteArray]) => x.toArray, partitions.asScala) val flattenedPartition: UnrolledPartition = Array.concat(allPartitions: _*) serveIterator(flattenedPartition.iterator, - s"serve RDD ${rdd.id} with partitions ${partitions.mkString(",")}") + s"serve RDD ${rdd.id} with partitions ${partitions.asScala.mkString(",")}") } /** @@ -819,7 +819,7 @@ private class PythonAccumulatorParam(@transient serverHost: String, serverPort: val in = socket.getInputStream val out = new DataOutputStream(new BufferedOutputStream(socket.getOutputStream, bufferSize)) out.writeInt(val2.size) - for (array <- val2) { + for (array <- val2.asScala) { out.writeInt(array.length) out.write(array) } diff --git a/core/src/main/scala/org/apache/spark/api/python/PythonUtils.scala b/core/src/main/scala/org/apache/spark/api/python/PythonUtils.scala index 90dacaeb93429..31e534f160eeb 100644 --- a/core/src/main/scala/org/apache/spark/api/python/PythonUtils.scala +++ b/core/src/main/scala/org/apache/spark/api/python/PythonUtils.scala @@ -17,10 +17,10 @@ package org.apache.spark.api.python -import java.io.{File} +import java.io.File import java.util.{List => JList} -import scala.collection.JavaConversions._ +import scala.collection.JavaConverters._ import scala.collection.mutable.ArrayBuffer import org.apache.spark.SparkContext @@ -51,7 +51,14 @@ private[spark] object PythonUtils { * Convert list of T into seq of T (for calling API with varargs) */ def toSeq[T](vs: JList[T]): Seq[T] = { - vs.toList.toSeq + vs.asScala + } + + /** + * Convert list of T into a (Scala) List of T + */ + def toList[T](vs: JList[T]): List[T] = { + vs.asScala.toList } /** @@ -65,6 +72,6 @@ private[spark] object PythonUtils { * Convert java map of 
K, V into Map of K, V (for calling API with varargs) */ def toScalaMap[K, V](jm: java.util.Map[K, V]): Map[K, V] = { - jm.toMap + jm.asScala.toMap } } diff --git a/core/src/main/scala/org/apache/spark/api/python/PythonWorkerFactory.scala b/core/src/main/scala/org/apache/spark/api/python/PythonWorkerFactory.scala index e314408c067e9..7039b734d2e40 100644 --- a/core/src/main/scala/org/apache/spark/api/python/PythonWorkerFactory.scala +++ b/core/src/main/scala/org/apache/spark/api/python/PythonWorkerFactory.scala @@ -19,9 +19,10 @@ package org.apache.spark.api.python import java.io.{DataOutputStream, DataInputStream, InputStream, OutputStreamWriter} import java.net.{InetAddress, ServerSocket, Socket, SocketException} +import java.util.Arrays import scala.collection.mutable -import scala.collection.JavaConversions._ +import scala.collection.JavaConverters._ import org.apache.spark._ import org.apache.spark.util.{RedirectThread, Utils} @@ -108,9 +109,9 @@ private[spark] class PythonWorkerFactory(pythonExec: String, envVars: Map[String serverSocket = new ServerSocket(0, 1, InetAddress.getByAddress(Array(127, 0, 0, 1))) // Create and start the worker - val pb = new ProcessBuilder(Seq(pythonExec, "-m", "pyspark.worker")) + val pb = new ProcessBuilder(Arrays.asList(pythonExec, "-m", "pyspark.worker")) val workerEnv = pb.environment() - workerEnv.putAll(envVars) + workerEnv.putAll(envVars.asJava) workerEnv.put("PYTHONPATH", pythonPath) // This is equivalent to setting the -u flag; we use it because ipython doesn't support -u: workerEnv.put("PYTHONUNBUFFERED", "YES") @@ -151,9 +152,9 @@ private[spark] class PythonWorkerFactory(pythonExec: String, envVars: Map[String try { // Create and start the daemon - val pb = new ProcessBuilder(Seq(pythonExec, "-m", "pyspark.daemon")) + val pb = new ProcessBuilder(Arrays.asList(pythonExec, "-m", "pyspark.daemon")) val workerEnv = pb.environment() - workerEnv.putAll(envVars) + workerEnv.putAll(envVars.asJava) workerEnv.put("PYTHONPATH", pythonPath) // This is equivalent to setting the -u flag; we use it because ipython doesn't support -u: workerEnv.put("PYTHONUNBUFFERED", "YES") diff --git a/core/src/main/scala/org/apache/spark/api/python/SerDeUtil.scala b/core/src/main/scala/org/apache/spark/api/python/SerDeUtil.scala index 1f1debcf84ad4..fd27276e70bfe 100644 --- a/core/src/main/scala/org/apache/spark/api/python/SerDeUtil.scala +++ b/core/src/main/scala/org/apache/spark/api/python/SerDeUtil.scala @@ -22,7 +22,6 @@ import java.util.{ArrayList => JArrayList} import org.apache.spark.api.java.JavaRDD -import scala.collection.JavaConversions._ import scala.collection.JavaConverters._ import scala.collection.mutable import scala.util.Failure @@ -214,7 +213,7 @@ private[spark] object SerDeUtil extends Logging { new AutoBatchedPickler(cleaned) } else { val pickle = new Pickler - cleaned.grouped(batchSize).map(batched => pickle.dumps(seqAsJavaList(batched))) + cleaned.grouped(batchSize).map(batched => pickle.dumps(batched.asJava)) } } } diff --git a/core/src/main/scala/org/apache/spark/api/python/WriteInputFormatTestDataGenerator.scala b/core/src/main/scala/org/apache/spark/api/python/WriteInputFormatTestDataGenerator.scala index 8f30ff9202c83..ee1fb056f0d96 100644 --- a/core/src/main/scala/org/apache/spark/api/python/WriteInputFormatTestDataGenerator.scala +++ b/core/src/main/scala/org/apache/spark/api/python/WriteInputFormatTestDataGenerator.scala @@ -20,6 +20,8 @@ package org.apache.spark.api.python import java.io.{DataOutput, DataInput} import java.{util => ju} 
+import scala.collection.JavaConverters._ + import com.google.common.base.Charsets.UTF_8 import org.apache.hadoop.io._ @@ -62,10 +64,9 @@ private[python] class TestInputKeyConverter extends Converter[Any, Any] { } private[python] class TestInputValueConverter extends Converter[Any, Any] { - import collection.JavaConversions._ override def convert(obj: Any): ju.List[Double] = { val m = obj.asInstanceOf[MapWritable] - seqAsJavaList(m.keySet.map(w => w.asInstanceOf[DoubleWritable].get()).toSeq) + m.keySet.asScala.map(_.asInstanceOf[DoubleWritable].get()).toSeq.asJava } } @@ -76,9 +77,8 @@ private[python] class TestOutputKeyConverter extends Converter[Any, Any] { } private[python] class TestOutputValueConverter extends Converter[Any, Any] { - import collection.JavaConversions._ override def convert(obj: Any): DoubleWritable = { - new DoubleWritable(obj.asInstanceOf[java.util.Map[Double, _]].keySet().head) + new DoubleWritable(obj.asInstanceOf[java.util.Map[Double, _]].keySet().iterator().next()) } } diff --git a/core/src/main/scala/org/apache/spark/api/r/RRDD.scala b/core/src/main/scala/org/apache/spark/api/r/RRDD.scala index 1cf2824f862ee..9d5bbb5d609f3 100644 --- a/core/src/main/scala/org/apache/spark/api/r/RRDD.scala +++ b/core/src/main/scala/org/apache/spark/api/r/RRDD.scala @@ -19,9 +19,10 @@ package org.apache.spark.api.r import java.io._ import java.net.{InetAddress, ServerSocket} +import java.util.Arrays import java.util.{Map => JMap} -import scala.collection.JavaConversions._ +import scala.collection.JavaConverters._ import scala.io.Source import scala.reflect.ClassTag import scala.util.Try @@ -365,11 +366,11 @@ private[r] object RRDD { sparkConf.setIfMissing("spark.master", "local") } - for ((name, value) <- sparkEnvirMap) { - sparkConf.set(name.asInstanceOf[String], value.asInstanceOf[String]) + for ((name, value) <- sparkEnvirMap.asScala) { + sparkConf.set(name.toString, value.toString) } - for ((name, value) <- sparkExecutorEnvMap) { - sparkConf.setExecutorEnv(name.asInstanceOf[String], value.asInstanceOf[String]) + for ((name, value) <- sparkExecutorEnvMap.asScala) { + sparkConf.setExecutorEnv(name.toString, value.toString) } val jsc = new JavaSparkContext(sparkConf) @@ -395,7 +396,7 @@ private[r] object RRDD { val rOptions = "--vanilla" val rLibDir = RUtils.sparkRPackagePath(isDriver = false) val rExecScript = rLibDir + "/SparkR/worker/" + script - val pb = new ProcessBuilder(List(rCommand, rOptions, rExecScript)) + val pb = new ProcessBuilder(Arrays.asList(rCommand, rOptions, rExecScript)) // Unset the R_TESTS environment variable for workers. // This is set by R CMD check as startup.Rs // (http://svn.r-project.org/R/trunk/src/library/tools/R/testing.R) diff --git a/core/src/main/scala/org/apache/spark/api/r/RUtils.scala b/core/src/main/scala/org/apache/spark/api/r/RUtils.scala index 427b2bc7cbcbb..9e807cc52f18c 100644 --- a/core/src/main/scala/org/apache/spark/api/r/RUtils.scala +++ b/core/src/main/scala/org/apache/spark/api/r/RUtils.scala @@ -18,8 +18,7 @@ package org.apache.spark.api.r import java.io.File - -import scala.collection.JavaConversions._ +import java.util.Arrays import org.apache.spark.{SparkEnv, SparkException} @@ -68,7 +67,7 @@ private[spark] object RUtils { /** Check if R is installed before running tests that use R commands. 
*/ def isRInstalled: Boolean = { try { - val builder = new ProcessBuilder(Seq("R", "--version")) + val builder = new ProcessBuilder(Arrays.asList("R", "--version")) builder.start().waitFor() == 0 } catch { case e: Exception => false diff --git a/core/src/main/scala/org/apache/spark/api/r/SerDe.scala b/core/src/main/scala/org/apache/spark/api/r/SerDe.scala index 3c89f24473744..dbbbcf40c1e96 100644 --- a/core/src/main/scala/org/apache/spark/api/r/SerDe.scala +++ b/core/src/main/scala/org/apache/spark/api/r/SerDe.scala @@ -20,7 +20,7 @@ package org.apache.spark.api.r import java.io.{DataInputStream, DataOutputStream} import java.sql.{Timestamp, Date, Time} -import scala.collection.JavaConversions._ +import scala.collection.JavaConverters._ /** * Utility functions to serialize, deserialize objects to / from R @@ -165,7 +165,7 @@ private[spark] object SerDe { val valueType = readObjectType(in) readTypedObject(in, valueType) }) - mapAsJavaMap(keys.zip(values).toMap) + keys.zip(values).toMap.asJava } else { new java.util.HashMap[Object, Object]() } diff --git a/core/src/main/scala/org/apache/spark/broadcast/TorrentBroadcast.scala b/core/src/main/scala/org/apache/spark/broadcast/TorrentBroadcast.scala index a0c9b5e63c744..7e3764d802fe1 100644 --- a/core/src/main/scala/org/apache/spark/broadcast/TorrentBroadcast.scala +++ b/core/src/main/scala/org/apache/spark/broadcast/TorrentBroadcast.scala @@ -20,7 +20,7 @@ package org.apache.spark.broadcast import java.io._ import java.nio.ByteBuffer -import scala.collection.JavaConversions.asJavaEnumeration +import scala.collection.JavaConverters._ import scala.reflect.ClassTag import scala.util.Random @@ -210,7 +210,7 @@ private object TorrentBroadcast extends Logging { compressionCodec: Option[CompressionCodec]): T = { require(blocks.nonEmpty, "Cannot unblockify an empty array of blocks") val is = new SequenceInputStream( - asJavaEnumeration(blocks.iterator.map(block => new ByteBufferInputStream(block)))) + blocks.iterator.map(new ByteBufferInputStream(_)).asJavaEnumeration) val in: InputStream = compressionCodec.map(c => c.compressedInputStream(is)).getOrElse(is) val ser = serializer.newInstance() val serIn = ser.deserializeStream(in) diff --git a/core/src/main/scala/org/apache/spark/deploy/ExternalShuffleService.scala b/core/src/main/scala/org/apache/spark/deploy/ExternalShuffleService.scala index 22ef701d833b2..6840a3ae831f0 100644 --- a/core/src/main/scala/org/apache/spark/deploy/ExternalShuffleService.scala +++ b/core/src/main/scala/org/apache/spark/deploy/ExternalShuffleService.scala @@ -19,13 +19,13 @@ package org.apache.spark.deploy import java.util.concurrent.CountDownLatch -import scala.collection.JavaConversions._ +import scala.collection.JavaConverters._ import org.apache.spark.{Logging, SparkConf, SecurityManager} import org.apache.spark.network.TransportContext import org.apache.spark.network.netty.SparkTransportConf import org.apache.spark.network.sasl.SaslServerBootstrap -import org.apache.spark.network.server.TransportServer +import org.apache.spark.network.server.{TransportServerBootstrap, TransportServer} import org.apache.spark.network.shuffle.ExternalShuffleBlockHandler import org.apache.spark.network.util.TransportConf import org.apache.spark.util.Utils @@ -67,13 +67,13 @@ class ExternalShuffleService(sparkConf: SparkConf, securityManager: SecurityMana def start() { require(server == null, "Shuffle server already started") logInfo(s"Starting shuffle service on port $port with useSasl = $useSasl") - val bootstraps = + val bootstraps: 
Seq[TransportServerBootstrap] = if (useSasl) { Seq(new SaslServerBootstrap(transportConf, securityManager)) } else { Nil } - server = transportContext.createServer(port, bootstraps) + server = transportContext.createServer(port, bootstraps.asJava) } /** Clean up all shuffle files associated with an application that has exited. */ diff --git a/core/src/main/scala/org/apache/spark/deploy/PythonRunner.scala b/core/src/main/scala/org/apache/spark/deploy/PythonRunner.scala index 23d01e9cbb9f9..d85327603f64d 100644 --- a/core/src/main/scala/org/apache/spark/deploy/PythonRunner.scala +++ b/core/src/main/scala/org/apache/spark/deploy/PythonRunner.scala @@ -21,7 +21,7 @@ import java.net.URI import java.io.File import scala.collection.mutable.ArrayBuffer -import scala.collection.JavaConversions._ +import scala.collection.JavaConverters._ import scala.util.Try import org.apache.spark.SparkUserAppException @@ -71,7 +71,7 @@ object PythonRunner { val pythonPath = PythonUtils.mergePythonPaths(pathElements: _*) // Launch Python process - val builder = new ProcessBuilder(Seq(pythonExec, formattedPythonFile) ++ otherArgs) + val builder = new ProcessBuilder((Seq(pythonExec, formattedPythonFile) ++ otherArgs).asJava) val env = builder.environment() env.put("PYTHONPATH", pythonPath) // This is equivalent to setting the -u flag; we use it because ipython doesn't support -u: diff --git a/core/src/main/scala/org/apache/spark/deploy/RPackageUtils.scala b/core/src/main/scala/org/apache/spark/deploy/RPackageUtils.scala index ed1e972955679..4b28866dcaa7c 100644 --- a/core/src/main/scala/org/apache/spark/deploy/RPackageUtils.scala +++ b/core/src/main/scala/org/apache/spark/deploy/RPackageUtils.scala @@ -22,7 +22,7 @@ import java.util.jar.JarFile import java.util.logging.Level import java.util.zip.{ZipEntry, ZipOutputStream} -import scala.collection.JavaConversions._ +import scala.collection.JavaConverters._ import com.google.common.io.{ByteStreams, Files} @@ -110,7 +110,7 @@ private[deploy] object RPackageUtils extends Logging { print(s"Building R package with the command: $installCmd", printStream) } try { - val builder = new ProcessBuilder(installCmd) + val builder = new ProcessBuilder(installCmd.asJava) builder.redirectErrorStream(true) val env = builder.environment() env.clear() diff --git a/core/src/main/scala/org/apache/spark/deploy/RRunner.scala b/core/src/main/scala/org/apache/spark/deploy/RRunner.scala index c0cab22fa8252..05b954ce36998 100644 --- a/core/src/main/scala/org/apache/spark/deploy/RRunner.scala +++ b/core/src/main/scala/org/apache/spark/deploy/RRunner.scala @@ -20,7 +20,7 @@ package org.apache.spark.deploy import java.io._ import java.util.concurrent.{Semaphore, TimeUnit} -import scala.collection.JavaConversions._ +import scala.collection.JavaConverters._ import org.apache.hadoop.fs.Path @@ -68,7 +68,7 @@ object RRunner { if (initialized.tryAcquire(backendTimeout, TimeUnit.SECONDS)) { // Launch R val returnCode = try { - val builder = new ProcessBuilder(Seq(rCommand, rFileNormalized) ++ otherArgs) + val builder = new ProcessBuilder((Seq(rCommand, rFileNormalized) ++ otherArgs).asJava) val env = builder.environment() env.put("EXISTING_SPARKR_BACKEND_PORT", sparkRBackendPort.toString) val rPackageDir = RUtils.sparkRPackagePath(isDriver = true) diff --git a/core/src/main/scala/org/apache/spark/deploy/SparkCuratorUtil.scala b/core/src/main/scala/org/apache/spark/deploy/SparkCuratorUtil.scala index b8d3993540220..8d5e716e6aea4 100644 --- 
a/core/src/main/scala/org/apache/spark/deploy/SparkCuratorUtil.scala +++ b/core/src/main/scala/org/apache/spark/deploy/SparkCuratorUtil.scala @@ -17,7 +17,7 @@ package org.apache.spark.deploy -import scala.collection.JavaConversions._ +import scala.collection.JavaConverters._ import org.apache.curator.framework.{CuratorFramework, CuratorFrameworkFactory} import org.apache.curator.retry.ExponentialBackoffRetry @@ -57,7 +57,7 @@ private[spark] object SparkCuratorUtil extends Logging { def deleteRecursive(zk: CuratorFramework, path: String) { if (zk.checkExists().forPath(path) != null) { - for (child <- zk.getChildren.forPath(path)) { + for (child <- zk.getChildren.forPath(path).asScala) { zk.delete().forPath(path + "/" + child) } zk.delete().forPath(path) diff --git a/core/src/main/scala/org/apache/spark/deploy/SparkHadoopUtil.scala b/core/src/main/scala/org/apache/spark/deploy/SparkHadoopUtil.scala index dda4216c7efe2..f7723ef5bde4c 100644 --- a/core/src/main/scala/org/apache/spark/deploy/SparkHadoopUtil.scala +++ b/core/src/main/scala/org/apache/spark/deploy/SparkHadoopUtil.scala @@ -22,7 +22,7 @@ import java.lang.reflect.Method import java.security.PrivilegedExceptionAction import java.util.{Arrays, Comparator} -import scala.collection.JavaConversions._ +import scala.collection.JavaConverters._ import scala.concurrent.duration._ import scala.language.postfixOps import scala.util.control.NonFatal @@ -71,7 +71,7 @@ class SparkHadoopUtil extends Logging { } def transferCredentials(source: UserGroupInformation, dest: UserGroupInformation) { - for (token <- source.getTokens()) { + for (token <- source.getTokens.asScala) { dest.addToken(token) } } @@ -175,8 +175,8 @@ class SparkHadoopUtil extends Logging { } private def getFileSystemThreadStatistics(): Seq[AnyRef] = { - val stats = FileSystem.getAllStatistics() - stats.map(Utils.invoke(classOf[Statistics], _, "getThreadStatistics")) + FileSystem.getAllStatistics.asScala.map( + Utils.invoke(classOf[Statistics], _, "getThreadStatistics")) } private def getFileSystemThreadStatisticsMethod(methodName: String): Method = { @@ -306,12 +306,13 @@ class SparkHadoopUtil extends Logging { val renewalInterval = sparkConf.getLong("spark.yarn.token.renewal.interval", (24 hours).toMillis) - credentials.getAllTokens.filter(_.getKind == DelegationTokenIdentifier.HDFS_DELEGATION_KIND) + credentials.getAllTokens.asScala + .filter(_.getKind == DelegationTokenIdentifier.HDFS_DELEGATION_KIND) .map { t => - val identifier = new DelegationTokenIdentifier() - identifier.readFields(new DataInputStream(new ByteArrayInputStream(t.getIdentifier))) - (identifier.getIssueDate + fraction * renewalInterval).toLong - now - }.foldLeft(0L)(math.max) + val identifier = new DelegationTokenIdentifier() + identifier.readFields(new DataInputStream(new ByteArrayInputStream(t.getIdentifier))) + (identifier.getIssueDate + fraction * renewalInterval).toLong - now + }.foldLeft(0L)(math.max) } diff --git a/core/src/main/scala/org/apache/spark/deploy/SparkSubmitArguments.scala b/core/src/main/scala/org/apache/spark/deploy/SparkSubmitArguments.scala index 3f3c6627c21fb..18a1c52ae53fb 100644 --- a/core/src/main/scala/org/apache/spark/deploy/SparkSubmitArguments.scala +++ b/core/src/main/scala/org/apache/spark/deploy/SparkSubmitArguments.scala @@ -23,7 +23,7 @@ import java.net.URI import java.util.{List => JList} import java.util.jar.JarFile -import scala.collection.JavaConversions._ +import scala.collection.JavaConverters._ import scala.collection.mutable.{ArrayBuffer, HashMap} import 
scala.io.Source @@ -94,7 +94,7 @@ private[deploy] class SparkSubmitArguments(args: Seq[String], env: Map[String, S // Set parameters from command line arguments try { - parse(args.toList) + parse(args.asJava) } catch { case e: IllegalArgumentException => SparkSubmit.printErrorAndExit(e.getMessage()) @@ -458,7 +458,7 @@ private[deploy] class SparkSubmitArguments(args: Seq[String], env: Map[String, S } override protected def handleExtraArgs(extra: JList[String]): Unit = { - childArgs ++= extra + childArgs ++= extra.asScala } private def printUsageAndExit(exitCode: Int, unknownParam: Any = null): Unit = { diff --git a/core/src/main/scala/org/apache/spark/deploy/master/ZooKeeperPersistenceEngine.scala b/core/src/main/scala/org/apache/spark/deploy/master/ZooKeeperPersistenceEngine.scala index 563831cc6b8dd..540e802420ce0 100644 --- a/core/src/main/scala/org/apache/spark/deploy/master/ZooKeeperPersistenceEngine.scala +++ b/core/src/main/scala/org/apache/spark/deploy/master/ZooKeeperPersistenceEngine.scala @@ -19,7 +19,7 @@ package org.apache.spark.deploy.master import java.nio.ByteBuffer -import scala.collection.JavaConversions._ +import scala.collection.JavaConverters._ import scala.reflect.ClassTag import org.apache.curator.framework.CuratorFramework @@ -49,8 +49,8 @@ private[master] class ZooKeeperPersistenceEngine(conf: SparkConf, val serializer } override def read[T: ClassTag](prefix: String): Seq[T] = { - val file = zk.getChildren.forPath(WORKING_DIR).filter(_.startsWith(prefix)) - file.map(deserializeFromFile[T]).flatten + zk.getChildren.forPath(WORKING_DIR).asScala + .filter(_.startsWith(prefix)).map(deserializeFromFile[T]).flatten } override def close() { diff --git a/core/src/main/scala/org/apache/spark/deploy/worker/CommandUtils.scala b/core/src/main/scala/org/apache/spark/deploy/worker/CommandUtils.scala index 45a3f43045437..ce02ee203a4bd 100644 --- a/core/src/main/scala/org/apache/spark/deploy/worker/CommandUtils.scala +++ b/core/src/main/scala/org/apache/spark/deploy/worker/CommandUtils.scala @@ -18,9 +18,8 @@ package org.apache.spark.deploy.worker import java.io.{File, FileOutputStream, InputStream, IOException} -import java.lang.System._ -import scala.collection.JavaConversions._ +import scala.collection.JavaConverters._ import scala.collection.Map import org.apache.spark.Logging @@ -62,7 +61,7 @@ object CommandUtils extends Logging { // SPARK-698: do not call the run.cmd script, as process.destroy() // fails to kill a process tree on Windows val cmd = new WorkerCommandBuilder(sparkHome, memory, command).buildCommand() - cmd.toSeq ++ Seq(command.mainClass) ++ command.arguments + cmd.asScala ++ Seq(command.mainClass) ++ command.arguments } /** diff --git a/core/src/main/scala/org/apache/spark/deploy/worker/DriverRunner.scala b/core/src/main/scala/org/apache/spark/deploy/worker/DriverRunner.scala index ec51c3d935d8e..89159ff5e2b3c 100644 --- a/core/src/main/scala/org/apache/spark/deploy/worker/DriverRunner.scala +++ b/core/src/main/scala/org/apache/spark/deploy/worker/DriverRunner.scala @@ -19,7 +19,7 @@ package org.apache.spark.deploy.worker import java.io._ -import scala.collection.JavaConversions._ +import scala.collection.JavaConverters._ import com.google.common.base.Charsets.UTF_8 import com.google.common.io.Files @@ -172,8 +172,8 @@ private[deploy] class DriverRunner( CommandUtils.redirectStream(process.getInputStream, stdout) val stderr = new File(baseDir, "stderr") - val header = "Launch Command: %s\n%s\n\n".format( - builder.command.mkString("\"", "\" \"", "\""), "=" * 
40) + val formattedCommand = builder.command.asScala.mkString("\"", "\" \"", "\"") + val header = "Launch Command: %s\n%s\n\n".format(formattedCommand, "=" * 40) Files.append(header, stderr, UTF_8) CommandUtils.redirectStream(process.getErrorStream, stderr) } @@ -229,6 +229,6 @@ private[deploy] trait ProcessBuilderLike { private[deploy] object ProcessBuilderLike { def apply(processBuilder: ProcessBuilder): ProcessBuilderLike = new ProcessBuilderLike { override def start(): Process = processBuilder.start() - override def command: Seq[String] = processBuilder.command() + override def command: Seq[String] = processBuilder.command().asScala } } diff --git a/core/src/main/scala/org/apache/spark/deploy/worker/ExecutorRunner.scala b/core/src/main/scala/org/apache/spark/deploy/worker/ExecutorRunner.scala index ab3fea475c2a5..3aef0515cbf6e 100644 --- a/core/src/main/scala/org/apache/spark/deploy/worker/ExecutorRunner.scala +++ b/core/src/main/scala/org/apache/spark/deploy/worker/ExecutorRunner.scala @@ -19,7 +19,7 @@ package org.apache.spark.deploy.worker import java.io._ -import scala.collection.JavaConversions._ +import scala.collection.JavaConverters._ import com.google.common.base.Charsets.UTF_8 import com.google.common.io.Files @@ -129,7 +129,8 @@ private[deploy] class ExecutorRunner( val builder = CommandUtils.buildProcessBuilder(appDesc.command, new SecurityManager(conf), memory, sparkHome.getAbsolutePath, substituteVariables) val command = builder.command() - logInfo("Launch command: " + command.mkString("\"", "\" \"", "\"")) + val formattedCommand = command.asScala.mkString("\"", "\" \"", "\"") + logInfo(s"Launch command: $formattedCommand") builder.directory(executorDir) builder.environment.put("SPARK_EXECUTOR_DIRS", appLocalDirs.mkString(File.pathSeparator)) @@ -145,7 +146,7 @@ private[deploy] class ExecutorRunner( process = builder.start() val header = "Spark Executor Command: %s\n%s\n\n".format( - command.mkString("\"", "\" \"", "\""), "=" * 40) + formattedCommand, "=" * 40) // Redirect its stdout and stderr to files val stdout = new File(executorDir, "stdout") diff --git a/core/src/main/scala/org/apache/spark/deploy/worker/Worker.scala b/core/src/main/scala/org/apache/spark/deploy/worker/Worker.scala index 79b1536d94016..770927c80f7a4 100755 --- a/core/src/main/scala/org/apache/spark/deploy/worker/Worker.scala +++ b/core/src/main/scala/org/apache/spark/deploy/worker/Worker.scala @@ -24,7 +24,6 @@ import java.util.{UUID, Date} import java.util.concurrent._ import java.util.concurrent.{Future => JFuture, ScheduledFuture => JScheduledFuture} -import scala.collection.JavaConversions._ import scala.collection.mutable.{HashMap, HashSet, LinkedHashMap} import scala.concurrent.ExecutionContext import scala.util.Random diff --git a/core/src/main/scala/org/apache/spark/executor/Executor.scala b/core/src/main/scala/org/apache/spark/executor/Executor.scala index 42a85e42ea2b6..c3491bb8b1cf3 100644 --- a/core/src/main/scala/org/apache/spark/executor/Executor.scala +++ b/core/src/main/scala/org/apache/spark/executor/Executor.scala @@ -23,7 +23,7 @@ import java.net.URL import java.nio.ByteBuffer import java.util.concurrent.{ConcurrentHashMap, TimeUnit} -import scala.collection.JavaConversions._ +import scala.collection.JavaConverters._ import scala.collection.mutable.{ArrayBuffer, HashMap} import scala.util.control.NonFatal @@ -147,7 +147,7 @@ private[spark] class Executor( /** Returns the total amount of time this JVM process has spent in garbage collection. 
*/ private def computeTotalGcTime(): Long = { - ManagementFactory.getGarbageCollectorMXBeans.map(_.getCollectionTime).sum + ManagementFactory.getGarbageCollectorMXBeans.asScala.map(_.getCollectionTime).sum } class TaskRunner( @@ -425,7 +425,7 @@ private[spark] class Executor( val tasksMetrics = new ArrayBuffer[(Long, TaskMetrics)]() val curGCTime = computeTotalGcTime() - for (taskRunner <- runningTasks.values()) { + for (taskRunner <- runningTasks.values().asScala) { if (taskRunner.task != null) { taskRunner.task.metrics.foreach { metrics => metrics.updateShuffleReadMetrics() diff --git a/core/src/main/scala/org/apache/spark/executor/ExecutorSource.scala b/core/src/main/scala/org/apache/spark/executor/ExecutorSource.scala index 293c512f8b70c..d16f4a1fc4e3b 100644 --- a/core/src/main/scala/org/apache/spark/executor/ExecutorSource.scala +++ b/core/src/main/scala/org/apache/spark/executor/ExecutorSource.scala @@ -19,7 +19,7 @@ package org.apache.spark.executor import java.util.concurrent.ThreadPoolExecutor -import scala.collection.JavaConversions._ +import scala.collection.JavaConverters._ import com.codahale.metrics.{Gauge, MetricRegistry} import org.apache.hadoop.fs.FileSystem @@ -30,7 +30,7 @@ private[spark] class ExecutorSource(threadPool: ThreadPoolExecutor, executorId: String) extends Source { private def fileStats(scheme: String) : Option[FileSystem.Statistics] = - FileSystem.getAllStatistics().find(s => s.getScheme.equals(scheme)) + FileSystem.getAllStatistics.asScala.find(s => s.getScheme.equals(scheme)) private def registerFileSystemStat[T]( scheme: String, name: String, f: FileSystem.Statistics => T, defaultValue: T) = { diff --git a/core/src/main/scala/org/apache/spark/executor/MesosExecutorBackend.scala b/core/src/main/scala/org/apache/spark/executor/MesosExecutorBackend.scala index cfd672e1d8a97..0474fd2ccc12e 100644 --- a/core/src/main/scala/org/apache/spark/executor/MesosExecutorBackend.scala +++ b/core/src/main/scala/org/apache/spark/executor/MesosExecutorBackend.scala @@ -19,7 +19,7 @@ package org.apache.spark.executor import java.nio.ByteBuffer -import scala.collection.JavaConversions._ +import scala.collection.JavaConverters._ import org.apache.mesos.protobuf.ByteString import org.apache.mesos.{Executor => MesosExecutor, ExecutorDriver, MesosExecutorDriver} @@ -28,7 +28,7 @@ import org.apache.mesos.Protos.{TaskStatus => MesosTaskStatus, _} import org.apache.spark.{Logging, TaskState, SparkConf, SparkEnv} import org.apache.spark.TaskState.TaskState import org.apache.spark.deploy.SparkHadoopUtil -import org.apache.spark.scheduler.cluster.mesos.{MesosTaskLaunchData} +import org.apache.spark.scheduler.cluster.mesos.MesosTaskLaunchData import org.apache.spark.util.{SignalLogger, Utils} private[spark] class MesosExecutorBackend @@ -55,7 +55,7 @@ private[spark] class MesosExecutorBackend slaveInfo: SlaveInfo) { // Get num cores for this task from ExecutorInfo, created in MesosSchedulerBackend. 
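
The computeTotalGcTime change just above shows the typical replacement for an implicit JavaConversions view: the java.util.List returned by a JDK API is wrapped explicitly with asScala before Scala combinators are applied. A standalone sketch of the same shape (the object name is illustrative):

import java.lang.management.ManagementFactory

import scala.collection.JavaConverters._

object GcTimeSketch {
  // Mirrors Executor.computeTotalGcTime after this patch: decorate the
  // java.util.List of GarbageCollectorMXBeans with asScala, then map and sum.
  def totalGcTimeMillis(): Long =
    ManagementFactory.getGarbageCollectorMXBeans.asScala.map(_.getCollectionTime).sum

  def main(args: Array[String]): Unit =
    println(s"Total GC time so far: ${totalGcTimeMillis()} ms")
}
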
- val cpusPerTask = executorInfo.getResourcesList + val cpusPerTask = executorInfo.getResourcesList.asScala .find(_.getName == "cpus") .map(_.getScalar.getValue.toInt) .getOrElse(0) diff --git a/core/src/main/scala/org/apache/spark/input/PortableDataStream.scala b/core/src/main/scala/org/apache/spark/input/PortableDataStream.scala index 6cda7772f77bc..a5ad47293f1c2 100644 --- a/core/src/main/scala/org/apache/spark/input/PortableDataStream.scala +++ b/core/src/main/scala/org/apache/spark/input/PortableDataStream.scala @@ -19,7 +19,7 @@ package org.apache.spark.input import java.io.{ByteArrayInputStream, ByteArrayOutputStream, DataInputStream, DataOutputStream} -import scala.collection.JavaConversions._ +import scala.collection.JavaConverters._ import com.google.common.io.ByteStreams import org.apache.hadoop.conf.Configuration @@ -44,12 +44,9 @@ private[spark] abstract class StreamFileInputFormat[T] * which is set through setMaxSplitSize */ def setMinPartitions(context: JobContext, minPartitions: Int) { - val files = listStatus(context) - val totalLen = files.map { file => - if (file.isDir) 0L else file.getLen - }.sum - - val maxSplitSize = Math.ceil(totalLen * 1.0 / files.length).toLong + val files = listStatus(context).asScala + val totalLen = files.map(file => if (file.isDir) 0L else file.getLen).sum + val maxSplitSize = Math.ceil(totalLen * 1.0 / files.size).toLong super.setMaxSplitSize(maxSplitSize) } diff --git a/core/src/main/scala/org/apache/spark/input/WholeTextFileInputFormat.scala b/core/src/main/scala/org/apache/spark/input/WholeTextFileInputFormat.scala index aaef7c74eea33..1ba34a11414a2 100644 --- a/core/src/main/scala/org/apache/spark/input/WholeTextFileInputFormat.scala +++ b/core/src/main/scala/org/apache/spark/input/WholeTextFileInputFormat.scala @@ -17,7 +17,7 @@ package org.apache.spark.input -import scala.collection.JavaConversions._ +import scala.collection.JavaConverters._ import org.apache.hadoop.fs.Path import org.apache.hadoop.mapreduce.InputSplit @@ -52,10 +52,8 @@ private[spark] class WholeTextFileInputFormat * which is set through setMaxSplitSize */ def setMinPartitions(context: JobContext, minPartitions: Int) { - val files = listStatus(context) - val totalLen = files.map { file => - if (file.isDir) 0L else file.getLen - }.sum + val files = listStatus(context).asScala + val totalLen = files.map(file => if (file.isDir) 0L else file.getLen).sum val maxSplitSize = Math.ceil(totalLen * 1.0 / (if (minPartitions == 0) 1 else minPartitions)).toLong super.setMaxSplitSize(maxSplitSize) diff --git a/core/src/main/scala/org/apache/spark/launcher/WorkerCommandBuilder.scala b/core/src/main/scala/org/apache/spark/launcher/WorkerCommandBuilder.scala index 9be98723aed14..0c096656f9236 100644 --- a/core/src/main/scala/org/apache/spark/launcher/WorkerCommandBuilder.scala +++ b/core/src/main/scala/org/apache/spark/launcher/WorkerCommandBuilder.scala @@ -20,7 +20,7 @@ package org.apache.spark.launcher import java.io.File import java.util.{HashMap => JHashMap, List => JList, Map => JMap} -import scala.collection.JavaConversions._ +import scala.collection.JavaConverters._ import org.apache.spark.deploy.Command @@ -32,7 +32,7 @@ import org.apache.spark.deploy.Command private[spark] class WorkerCommandBuilder(sparkHome: String, memoryMb: Int, command: Command) extends AbstractCommandBuilder { - childEnv.putAll(command.environment) + childEnv.putAll(command.environment.asJava) childEnv.put(CommandBuilderUtils.ENV_SPARK_HOME, sparkHome) override def buildCommand(env: JMap[String, 
String]): JList[String] = { diff --git a/core/src/main/scala/org/apache/spark/metrics/MetricsConfig.scala b/core/src/main/scala/org/apache/spark/metrics/MetricsConfig.scala index d7495551ad233..dd2d325d87034 100644 --- a/core/src/main/scala/org/apache/spark/metrics/MetricsConfig.scala +++ b/core/src/main/scala/org/apache/spark/metrics/MetricsConfig.scala @@ -20,6 +20,7 @@ package org.apache.spark.metrics import java.io.{FileInputStream, InputStream} import java.util.Properties +import scala.collection.JavaConverters._ import scala.collection.mutable import scala.util.matching.Regex @@ -58,25 +59,20 @@ private[spark] class MetricsConfig(conf: SparkConf) extends Logging { propertyCategories = subProperties(properties, INSTANCE_REGEX) if (propertyCategories.contains(DEFAULT_PREFIX)) { - import scala.collection.JavaConversions._ - - val defaultProperty = propertyCategories(DEFAULT_PREFIX) - for { (inst, prop) <- propertyCategories - if (inst != DEFAULT_PREFIX) - (k, v) <- defaultProperty - if (prop.getProperty(k) == null) } { - prop.setProperty(k, v) + val defaultProperty = propertyCategories(DEFAULT_PREFIX).asScala + for((inst, prop) <- propertyCategories if (inst != DEFAULT_PREFIX); + (k, v) <- defaultProperty if (prop.get(k) == null)) { + prop.put(k, v) } } } def subProperties(prop: Properties, regex: Regex): mutable.HashMap[String, Properties] = { val subProperties = new mutable.HashMap[String, Properties] - import scala.collection.JavaConversions._ - prop.foreach { kv => - if (regex.findPrefixOf(kv._1).isDefined) { - val regex(prefix, suffix) = kv._1 - subProperties.getOrElseUpdate(prefix, new Properties).setProperty(suffix, kv._2) + prop.asScala.foreach { kv => + if (regex.findPrefixOf(kv._1.toString).isDefined) { + val regex(prefix, suffix) = kv._1.toString + subProperties.getOrElseUpdate(prefix, new Properties).setProperty(suffix, kv._2.toString) } } subProperties diff --git a/core/src/main/scala/org/apache/spark/network/netty/NettyBlockRpcServer.scala b/core/src/main/scala/org/apache/spark/network/netty/NettyBlockRpcServer.scala index b089da8596e2b..7c170a742fb64 100644 --- a/core/src/main/scala/org/apache/spark/network/netty/NettyBlockRpcServer.scala +++ b/core/src/main/scala/org/apache/spark/network/netty/NettyBlockRpcServer.scala @@ -19,7 +19,7 @@ package org.apache.spark.network.netty import java.nio.ByteBuffer -import scala.collection.JavaConversions._ +import scala.collection.JavaConverters._ import org.apache.spark.Logging import org.apache.spark.network.BlockDataManager @@ -55,7 +55,7 @@ class NettyBlockRpcServer( case openBlocks: OpenBlocks => val blocks: Seq[ManagedBuffer] = openBlocks.blockIds.map(BlockId.apply).map(blockManager.getBlockData) - val streamId = streamManager.registerStream(blocks.iterator) + val streamId = streamManager.registerStream(blocks.iterator.asJava) logTrace(s"Registered streamId $streamId with ${blocks.size} buffers") responseContext.onSuccess(new StreamHandle(streamId, blocks.size).toByteArray) diff --git a/core/src/main/scala/org/apache/spark/network/netty/NettyBlockTransferService.scala b/core/src/main/scala/org/apache/spark/network/netty/NettyBlockTransferService.scala index d650d5fe73087..ff8aae9ebe9f0 100644 --- a/core/src/main/scala/org/apache/spark/network/netty/NettyBlockTransferService.scala +++ b/core/src/main/scala/org/apache/spark/network/netty/NettyBlockTransferService.scala @@ -17,7 +17,7 @@ package org.apache.spark.network.netty -import scala.collection.JavaConversions._ +import scala.collection.JavaConverters._ import 
scala.concurrent.{Future, Promise} import org.apache.spark.{SecurityManager, SparkConf} @@ -58,7 +58,7 @@ class NettyBlockTransferService(conf: SparkConf, securityManager: SecurityManage securityManager.isSaslEncryptionEnabled())) } transportContext = new TransportContext(transportConf, rpcHandler) - clientFactory = transportContext.createClientFactory(clientBootstrap.toList) + clientFactory = transportContext.createClientFactory(clientBootstrap.toSeq.asJava) server = createServer(serverBootstrap.toList) appId = conf.getAppId logInfo("Server created on " + server.getPort) @@ -67,7 +67,7 @@ class NettyBlockTransferService(conf: SparkConf, securityManager: SecurityManage /** Creates and binds the TransportServer, possibly trying multiple ports. */ private def createServer(bootstraps: List[TransportServerBootstrap]): TransportServer = { def startService(port: Int): (TransportServer, Int) = { - val server = transportContext.createServer(port, bootstraps) + val server = transportContext.createServer(port, bootstraps.asJava) (server, server.getPort) } diff --git a/core/src/main/scala/org/apache/spark/network/nio/Connection.scala b/core/src/main/scala/org/apache/spark/network/nio/Connection.scala index 1499da07bb83b..8d9ebadaf79d4 100644 --- a/core/src/main/scala/org/apache/spark/network/nio/Connection.scala +++ b/core/src/main/scala/org/apache/spark/network/nio/Connection.scala @@ -23,7 +23,7 @@ import java.nio.channels._ import java.util.concurrent.ConcurrentLinkedQueue import java.util.LinkedList -import scala.collection.JavaConversions._ +import scala.collection.JavaConverters._ import scala.collection.mutable.{ArrayBuffer, HashMap} import scala.util.control.NonFatal @@ -145,7 +145,7 @@ abstract class Connection(val channel: SocketChannel, val selector: Selector, } def callOnExceptionCallbacks(e: Throwable) { - onExceptionCallbacks foreach { + onExceptionCallbacks.asScala.foreach { callback => try { callback(this, e) diff --git a/core/src/main/scala/org/apache/spark/partial/GroupedCountEvaluator.scala b/core/src/main/scala/org/apache/spark/partial/GroupedCountEvaluator.scala index 91b07ce3af1b6..5afce75680f94 100644 --- a/core/src/main/scala/org/apache/spark/partial/GroupedCountEvaluator.scala +++ b/core/src/main/scala/org/apache/spark/partial/GroupedCountEvaluator.scala @@ -19,7 +19,7 @@ package org.apache.spark.partial import java.util.{HashMap => JHashMap} -import scala.collection.JavaConversions.mapAsScalaMap +import scala.collection.JavaConverters._ import scala.collection.Map import scala.collection.mutable.HashMap import scala.reflect.ClassTag @@ -48,9 +48,9 @@ private[spark] class GroupedCountEvaluator[T : ClassTag](totalOutputs: Int, conf if (outputsMerged == totalOutputs) { val result = new JHashMap[T, BoundedDouble](sums.size) sums.foreach { case (key, sum) => - result(key) = new BoundedDouble(sum, 1.0, sum, sum) + result.put(key, new BoundedDouble(sum, 1.0, sum, sum)) } - result + result.asScala } else if (outputsMerged == 0) { new HashMap[T, BoundedDouble] } else { @@ -64,9 +64,9 @@ private[spark] class GroupedCountEvaluator[T : ClassTag](totalOutputs: Int, conf val stdev = math.sqrt(variance) val low = mean - confFactor * stdev val high = mean + confFactor * stdev - result(key) = new BoundedDouble(mean, confidence, low, high) + result.put(key, new BoundedDouble(mean, confidence, low, high)) } - result + result.asScala } } } diff --git a/core/src/main/scala/org/apache/spark/partial/GroupedMeanEvaluator.scala 
b/core/src/main/scala/org/apache/spark/partial/GroupedMeanEvaluator.scala index af26c3d59ac02..a164040684803 100644 --- a/core/src/main/scala/org/apache/spark/partial/GroupedMeanEvaluator.scala +++ b/core/src/main/scala/org/apache/spark/partial/GroupedMeanEvaluator.scala @@ -19,7 +19,7 @@ package org.apache.spark.partial import java.util.{HashMap => JHashMap} -import scala.collection.JavaConversions.mapAsScalaMap +import scala.collection.JavaConverters._ import scala.collection.Map import scala.collection.mutable.HashMap @@ -55,9 +55,9 @@ private[spark] class GroupedMeanEvaluator[T](totalOutputs: Int, confidence: Doub while (iter.hasNext) { val entry = iter.next() val mean = entry.getValue.mean - result(entry.getKey) = new BoundedDouble(mean, 1.0, mean, mean) + result.put(entry.getKey, new BoundedDouble(mean, 1.0, mean, mean)) } - result + result.asScala } else if (outputsMerged == 0) { new HashMap[T, BoundedDouble] } else { @@ -72,9 +72,9 @@ private[spark] class GroupedMeanEvaluator[T](totalOutputs: Int, confidence: Doub val confFactor = studentTCacher.get(counter.count) val low = mean - confFactor * stdev val high = mean + confFactor * stdev - result(entry.getKey) = new BoundedDouble(mean, confidence, low, high) + result.put(entry.getKey, new BoundedDouble(mean, confidence, low, high)) } - result + result.asScala } } } diff --git a/core/src/main/scala/org/apache/spark/partial/GroupedSumEvaluator.scala b/core/src/main/scala/org/apache/spark/partial/GroupedSumEvaluator.scala index 442fb86227d86..54a1beab3514b 100644 --- a/core/src/main/scala/org/apache/spark/partial/GroupedSumEvaluator.scala +++ b/core/src/main/scala/org/apache/spark/partial/GroupedSumEvaluator.scala @@ -19,7 +19,7 @@ package org.apache.spark.partial import java.util.{HashMap => JHashMap} -import scala.collection.JavaConversions.mapAsScalaMap +import scala.collection.JavaConverters._ import scala.collection.Map import scala.collection.mutable.HashMap @@ -55,9 +55,9 @@ private[spark] class GroupedSumEvaluator[T](totalOutputs: Int, confidence: Doubl while (iter.hasNext) { val entry = iter.next() val sum = entry.getValue.sum - result(entry.getKey) = new BoundedDouble(sum, 1.0, sum, sum) + result.put(entry.getKey, new BoundedDouble(sum, 1.0, sum, sum)) } - result + result.asScala } else if (outputsMerged == 0) { new HashMap[T, BoundedDouble] } else { @@ -80,9 +80,9 @@ private[spark] class GroupedSumEvaluator[T](totalOutputs: Int, confidence: Doubl val confFactor = studentTCacher.get(counter.count) val low = sumEstimate - confFactor * sumStdev val high = sumEstimate + confFactor * sumStdev - result(entry.getKey) = new BoundedDouble(sumEstimate, confidence, low, high) + result.put(entry.getKey, new BoundedDouble(sumEstimate, confidence, low, high)) } - result + result.asScala } } } diff --git a/core/src/main/scala/org/apache/spark/rdd/PairRDDFunctions.scala b/core/src/main/scala/org/apache/spark/rdd/PairRDDFunctions.scala index 326fafb230a40..4e5f2e8a5d467 100644 --- a/core/src/main/scala/org/apache/spark/rdd/PairRDDFunctions.scala +++ b/core/src/main/scala/org/apache/spark/rdd/PairRDDFunctions.scala @@ -22,7 +22,7 @@ import java.text.SimpleDateFormat import java.util.{Date, HashMap => JHashMap} import scala.collection.{Map, mutable} -import scala.collection.JavaConversions._ +import scala.collection.JavaConverters._ import scala.collection.mutable.ArrayBuffer import scala.reflect.ClassTag import scala.util.DynamicVariable @@ -312,14 +312,14 @@ class PairRDDFunctions[K, V](self: RDD[(K, V)]) } : Iterator[JHashMap[K, V]] val 
mergeMaps = (m1: JHashMap[K, V], m2: JHashMap[K, V]) => { - m2.foreach { pair => + m2.asScala.foreach { pair => val old = m1.get(pair._1) m1.put(pair._1, if (old == null) pair._2 else cleanedF(old, pair._2)) } m1 } : JHashMap[K, V] - self.mapPartitions(reducePartition).reduce(mergeMaps) + self.mapPartitions(reducePartition).reduce(mergeMaps).asScala } /** Alias for reduceByKeyLocally */ diff --git a/core/src/main/scala/org/apache/spark/rdd/PipedRDD.scala b/core/src/main/scala/org/apache/spark/rdd/PipedRDD.scala index 3bb9998e1db44..afbe566b76566 100644 --- a/core/src/main/scala/org/apache/spark/rdd/PipedRDD.scala +++ b/core/src/main/scala/org/apache/spark/rdd/PipedRDD.scala @@ -23,7 +23,7 @@ import java.io.IOException import java.io.PrintWriter import java.util.StringTokenizer -import scala.collection.JavaConversions._ +import scala.collection.JavaConverters._ import scala.collection.Map import scala.collection.mutable.ArrayBuffer import scala.io.Source @@ -72,7 +72,7 @@ private[spark] class PipedRDD[T: ClassTag]( } override def compute(split: Partition, context: TaskContext): Iterator[String] = { - val pb = new ProcessBuilder(command) + val pb = new ProcessBuilder(command.asJava) // Add the environmental variables to the process. val currentEnvVars = pb.environment() envVars.foreach { case (variable, value) => currentEnvVars.put(variable, value) } @@ -81,7 +81,7 @@ private[spark] class PipedRDD[T: ClassTag]( // so the user code can access the input filename if (split.isInstanceOf[HadoopPartition]) { val hadoopSplit = split.asInstanceOf[HadoopPartition] - currentEnvVars.putAll(hadoopSplit.getPipeEnvVars()) + currentEnvVars.putAll(hadoopSplit.getPipeEnvVars().asJava) } // When spark.worker.separated.working.directory option is turned on, each diff --git a/core/src/main/scala/org/apache/spark/rdd/SubtractedRDD.scala b/core/src/main/scala/org/apache/spark/rdd/SubtractedRDD.scala index f7cb1791d4ac6..9a4fa301b06e3 100644 --- a/core/src/main/scala/org/apache/spark/rdd/SubtractedRDD.scala +++ b/core/src/main/scala/org/apache/spark/rdd/SubtractedRDD.scala @@ -19,7 +19,7 @@ package org.apache.spark.rdd import java.util.{HashMap => JHashMap} -import scala.collection.JavaConversions._ +import scala.collection.JavaConverters._ import scala.collection.mutable.ArrayBuffer import scala.reflect.ClassTag @@ -125,7 +125,7 @@ private[spark] class SubtractedRDD[K: ClassTag, V: ClassTag, W: ClassTag]( integrate(0, t => getSeq(t._1) += t._2) // the second dep is rdd2; remove all of its keys integrate(1, t => map.remove(t._1)) - map.iterator.map { t => t._2.iterator.map { (t._1, _) } }.flatten + map.asScala.iterator.map(t => t._2.iterator.map((t._1, _))).flatten } override def clearDependencies() { diff --git a/core/src/main/scala/org/apache/spark/scheduler/InputFormatInfo.scala b/core/src/main/scala/org/apache/spark/scheduler/InputFormatInfo.scala index bac37bfdaa23f..0e438ab4366d9 100644 --- a/core/src/main/scala/org/apache/spark/scheduler/InputFormatInfo.scala +++ b/core/src/main/scala/org/apache/spark/scheduler/InputFormatInfo.scala @@ -17,7 +17,7 @@ package org.apache.spark.scheduler -import scala.collection.JavaConversions._ +import scala.collection.JavaConverters._ import scala.collection.immutable.Set import scala.collection.mutable.{ArrayBuffer, HashMap, HashSet} @@ -107,7 +107,7 @@ class InputFormatInfo(val configuration: Configuration, val inputFormatClazz: Cl val retval = new ArrayBuffer[SplitInfo]() val list = instance.getSplits(job) - for (split <- list) { + for (split <- list.asScala) { retval 
++= SplitInfo.toSplitInfo(inputFormatClazz, path, split) } diff --git a/core/src/main/scala/org/apache/spark/scheduler/Pool.scala b/core/src/main/scala/org/apache/spark/scheduler/Pool.scala index 174b73221afc0..5821afea98982 100644 --- a/core/src/main/scala/org/apache/spark/scheduler/Pool.scala +++ b/core/src/main/scala/org/apache/spark/scheduler/Pool.scala @@ -19,7 +19,7 @@ package org.apache.spark.scheduler import java.util.concurrent.{ConcurrentHashMap, ConcurrentLinkedQueue} -import scala.collection.JavaConversions._ +import scala.collection.JavaConverters._ import scala.collection.mutable.ArrayBuffer import org.apache.spark.Logging @@ -74,7 +74,7 @@ private[spark] class Pool( if (schedulableNameToSchedulable.containsKey(schedulableName)) { return schedulableNameToSchedulable.get(schedulableName) } - for (schedulable <- schedulableQueue) { + for (schedulable <- schedulableQueue.asScala) { val sched = schedulable.getSchedulableByName(schedulableName) if (sched != null) { return sched @@ -84,12 +84,12 @@ private[spark] class Pool( } override def executorLost(executorId: String, host: String) { - schedulableQueue.foreach(_.executorLost(executorId, host)) + schedulableQueue.asScala.foreach(_.executorLost(executorId, host)) } override def checkSpeculatableTasks(): Boolean = { var shouldRevive = false - for (schedulable <- schedulableQueue) { + for (schedulable <- schedulableQueue.asScala) { shouldRevive |= schedulable.checkSpeculatableTasks() } shouldRevive @@ -98,7 +98,7 @@ private[spark] class Pool( override def getSortedTaskSetQueue: ArrayBuffer[TaskSetManager] = { var sortedTaskSetQueue = new ArrayBuffer[TaskSetManager] val sortedSchedulableQueue = - schedulableQueue.toSeq.sortWith(taskSetSchedulingAlgorithm.comparator) + schedulableQueue.asScala.toSeq.sortWith(taskSetSchedulingAlgorithm.comparator) for (schedulable <- sortedSchedulableQueue) { sortedTaskSetQueue ++= schedulable.getSortedTaskSetQueue } diff --git a/core/src/main/scala/org/apache/spark/scheduler/cluster/mesos/CoarseMesosSchedulerBackend.scala b/core/src/main/scala/org/apache/spark/scheduler/cluster/mesos/CoarseMesosSchedulerBackend.scala index d6e1e9e5bebc2..452c32d5411cd 100644 --- a/core/src/main/scala/org/apache/spark/scheduler/cluster/mesos/CoarseMesosSchedulerBackend.scala +++ b/core/src/main/scala/org/apache/spark/scheduler/cluster/mesos/CoarseMesosSchedulerBackend.scala @@ -21,7 +21,7 @@ import java.io.File import java.util.concurrent.locks.ReentrantLock import java.util.{Collections, List => JList} -import scala.collection.JavaConversions._ +import scala.collection.JavaConverters._ import scala.collection.mutable.{HashMap, HashSet} import com.google.common.collect.HashBiMap @@ -233,7 +233,7 @@ private[spark] class CoarseMesosSchedulerBackend( override def resourceOffers(d: SchedulerDriver, offers: JList[Offer]) { stateLock.synchronized { val filters = Filters.newBuilder().setRefuseSeconds(5).build() - for (offer <- offers) { + for (offer <- offers.asScala) { val offerAttributes = toAttributeMap(offer.getAttributesList) val meetsConstraints = matchesAttributeRequirements(slaveOfferConstraints, offerAttributes) val slaveId = offer.getSlaveId.getValue @@ -251,21 +251,21 @@ private[spark] class CoarseMesosSchedulerBackend( val cpusToUse = math.min(cpus, maxCores - totalCoresAcquired) totalCoresAcquired += cpusToUse val taskId = newMesosTaskId() - taskIdToSlaveId(taskId) = slaveId + taskIdToSlaveId.put(taskId, slaveId) slaveIdsWithExecutors += slaveId coresByTaskId(taskId) = cpusToUse // Gather cpu resources from the 
available resources and use them in the task. val (remainingResources, cpuResourcesToUse) = partitionResources(offer.getResourcesList, "cpus", cpusToUse) val (_, memResourcesToUse) = - partitionResources(remainingResources, "mem", calculateTotalMemory(sc)) + partitionResources(remainingResources.asJava, "mem", calculateTotalMemory(sc)) val taskBuilder = MesosTaskInfo.newBuilder() .setTaskId(TaskID.newBuilder().setValue(taskId.toString).build()) .setSlaveId(offer.getSlaveId) .setCommand(createCommand(offer, cpusToUse + extraCoresPerSlave, taskId)) .setName("Task " + taskId) - .addAllResources(cpuResourcesToUse) - .addAllResources(memResourcesToUse) + .addAllResources(cpuResourcesToUse.asJava) + .addAllResources(memResourcesToUse.asJava) sc.conf.getOption("spark.mesos.executor.docker.image").foreach { image => MesosSchedulerBackendUtil @@ -314,9 +314,9 @@ private[spark] class CoarseMesosSchedulerBackend( } if (TaskState.isFinished(TaskState.fromMesos(state))) { - val slaveId = taskIdToSlaveId(taskId) + val slaveId = taskIdToSlaveId.get(taskId) slaveIdsWithExecutors -= slaveId - taskIdToSlaveId -= taskId + taskIdToSlaveId.remove(taskId) // Remove the cores we have remembered for this task, if it's in the hashmap for (cores <- coresByTaskId.get(taskId)) { totalCoresAcquired -= cores @@ -361,7 +361,7 @@ private[spark] class CoarseMesosSchedulerBackend( stateLock.synchronized { if (slaveIdsWithExecutors.contains(slaveId)) { val slaveIdToTaskId = taskIdToSlaveId.inverse() - if (slaveIdToTaskId.contains(slaveId)) { + if (slaveIdToTaskId.containsKey(slaveId)) { val taskId: Int = slaveIdToTaskId.get(slaveId) taskIdToSlaveId.remove(taskId) removeExecutor(sparkExecutorId(slaveId, taskId.toString), reason) @@ -411,7 +411,7 @@ private[spark] class CoarseMesosSchedulerBackend( val slaveIdToTaskId = taskIdToSlaveId.inverse() for (executorId <- executorIds) { val slaveId = executorId.split("/")(0) - if (slaveIdToTaskId.contains(slaveId)) { + if (slaveIdToTaskId.containsKey(slaveId)) { mesosDriver.killTask( TaskID.newBuilder().setValue(slaveIdToTaskId.get(slaveId).toString).build()) pendingRemovedSlaveIds += slaveId diff --git a/core/src/main/scala/org/apache/spark/scheduler/cluster/mesos/MesosClusterPersistenceEngine.scala b/core/src/main/scala/org/apache/spark/scheduler/cluster/mesos/MesosClusterPersistenceEngine.scala index 3efc536f1456c..e0c547dce6d07 100644 --- a/core/src/main/scala/org/apache/spark/scheduler/cluster/mesos/MesosClusterPersistenceEngine.scala +++ b/core/src/main/scala/org/apache/spark/scheduler/cluster/mesos/MesosClusterPersistenceEngine.scala @@ -17,7 +17,7 @@ package org.apache.spark.scheduler.cluster.mesos -import scala.collection.JavaConversions._ +import scala.collection.JavaConverters._ import org.apache.curator.framework.CuratorFramework import org.apache.zookeeper.CreateMode @@ -129,6 +129,6 @@ private[spark] class ZookeeperMesosClusterPersistenceEngine( } override def fetchAll[T](): Iterable[T] = { - zk.getChildren.forPath(WORKING_DIR).map(fetch[T]).flatten + zk.getChildren.forPath(WORKING_DIR).asScala.flatMap(fetch[T]) } } diff --git a/core/src/main/scala/org/apache/spark/scheduler/cluster/mesos/MesosClusterScheduler.scala b/core/src/main/scala/org/apache/spark/scheduler/cluster/mesos/MesosClusterScheduler.scala index 1206f184fbc82..07da9242b9922 100644 --- a/core/src/main/scala/org/apache/spark/scheduler/cluster/mesos/MesosClusterScheduler.scala +++ b/core/src/main/scala/org/apache/spark/scheduler/cluster/mesos/MesosClusterScheduler.scala @@ -21,7 +21,7 @@ import java.io.File 
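
Several call sites in these Mesos and Netty hunks pass a Scala buffer to a Java API by decorating it with asJava, for example addAllResources(cpuResourcesToUse.asJava) above and createServer(port, bootstraps.asJava) earlier in the patch. The decorator returns a live wrapper view rather than a copy. A small sketch, where javaOnlyApi is a hypothetical stand-in for such a Java method:

import java.util.{List => JList}

import scala.collection.JavaConverters._
import scala.collection.mutable.ArrayBuffer

object BufferToJavaListSketch {
  // Hypothetical stand-in for a Java API that only accepts java.util.List,
  // in the spirit of the Mesos and Netty methods called in this patch.
  def javaOnlyApi(items: JList[String]): Int = items.size()

  def main(args: Array[String]): Unit = {
    val tasks = ArrayBuffer("task-1", "task-2")
    // asJava wraps the buffer in a live java.util.List view; later additions
    // to the buffer are visible through the wrapper as well.
    println(javaOnlyApi(tasks.asJava))
  }
}
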
import java.util.concurrent.locks.ReentrantLock import java.util.{Collections, Date, List => JList} -import scala.collection.JavaConversions._ +import scala.collection.JavaConverters._ import scala.collection.mutable import scala.collection.mutable.ArrayBuffer @@ -350,7 +350,7 @@ private[spark] class MesosClusterScheduler( } // TODO: Page the status updates to avoid trying to reconcile // a large amount of tasks at once. - driver.reconcileTasks(statuses) + driver.reconcileTasks(statuses.toSeq.asJava) } } } @@ -493,10 +493,10 @@ private[spark] class MesosClusterScheduler( } override def resourceOffers(driver: SchedulerDriver, offers: JList[Offer]): Unit = { - val currentOffers = offers.map { o => + val currentOffers = offers.asScala.map(o => new ResourceOffer( o, getResource(o.getResourcesList, "cpus"), getResource(o.getResourcesList, "mem")) - }.toList + ).toList logTrace(s"Received offers from Mesos: \n${currentOffers.mkString("\n")}") val tasks = new mutable.HashMap[OfferID, ArrayBuffer[TaskInfo]]() val currentTime = new Date() @@ -521,10 +521,10 @@ private[spark] class MesosClusterScheduler( currentOffers, tasks) } - tasks.foreach { case (offerId, tasks) => - driver.launchTasks(Collections.singleton(offerId), tasks) + tasks.foreach { case (offerId, taskInfos) => + driver.launchTasks(Collections.singleton(offerId), taskInfos.asJava) } - offers + offers.asScala .filter(o => !tasks.keySet.contains(o.getId)) .foreach(o => driver.declineOffer(o.getId)) } diff --git a/core/src/main/scala/org/apache/spark/scheduler/cluster/mesos/MesosSchedulerBackend.scala b/core/src/main/scala/org/apache/spark/scheduler/cluster/mesos/MesosSchedulerBackend.scala index 5c20606d58715..2e424054be785 100644 --- a/core/src/main/scala/org/apache/spark/scheduler/cluster/mesos/MesosSchedulerBackend.scala +++ b/core/src/main/scala/org/apache/spark/scheduler/cluster/mesos/MesosSchedulerBackend.scala @@ -20,7 +20,7 @@ package org.apache.spark.scheduler.cluster.mesos import java.io.File import java.util.{ArrayList => JArrayList, Collections, List => JList} -import scala.collection.JavaConversions._ +import scala.collection.JavaConverters._ import scala.collection.mutable.{HashMap, HashSet} import org.apache.mesos.{Scheduler => MScheduler, _} @@ -129,14 +129,12 @@ private[spark] class MesosSchedulerBackend( val (resourcesAfterCpu, usedCpuResources) = partitionResources(availableResources, "cpus", scheduler.CPUS_PER_TASK) val (resourcesAfterMem, usedMemResources) = - partitionResources(resourcesAfterCpu, "mem", calculateTotalMemory(sc)) + partitionResources(resourcesAfterCpu.asJava, "mem", calculateTotalMemory(sc)) - builder.addAllResources(usedCpuResources) - builder.addAllResources(usedMemResources) + builder.addAllResources(usedCpuResources.asJava) + builder.addAllResources(usedMemResources.asJava) - sc.conf.getOption("spark.mesos.uris").map { uris => - setupUris(uris, command) - } + sc.conf.getOption("spark.mesos.uris").foreach(setupUris(_, command)) val executorInfo = builder .setExecutorId(ExecutorID.newBuilder().setValue(execId).build()) @@ -148,7 +146,7 @@ private[spark] class MesosSchedulerBackend( .setupContainerBuilderDockerInfo(image, sc.conf, executorInfo.getContainerBuilder()) } - (executorInfo.build(), resourcesAfterMem) + (executorInfo.build(), resourcesAfterMem.asJava) } /** @@ -193,7 +191,7 @@ private[spark] class MesosSchedulerBackend( private def getTasksSummary(tasks: JArrayList[MesosTaskInfo]): String = { val builder = new StringBuilder - tasks.foreach { t => + tasks.asScala.foreach { t => 
builder.append("Task id: ").append(t.getTaskId.getValue).append("\n") .append("Slave id: ").append(t.getSlaveId.getValue).append("\n") .append("Task resources: ").append(t.getResourcesList).append("\n") @@ -211,7 +209,7 @@ private[spark] class MesosSchedulerBackend( override def resourceOffers(d: SchedulerDriver, offers: JList[Offer]) { inClassLoader() { // Fail-fast on offers we know will be rejected - val (usableOffers, unUsableOffers) = offers.partition { o => + val (usableOffers, unUsableOffers) = offers.asScala.partition { o => val mem = getResource(o.getResourcesList, "mem") val cpus = getResource(o.getResourcesList, "cpus") val slaveId = o.getSlaveId.getValue @@ -323,10 +321,10 @@ private[spark] class MesosSchedulerBackend( .setSlaveId(SlaveID.newBuilder().setValue(slaveId).build()) .setExecutor(executorInfo) .setName(task.name) - .addAllResources(cpuResources) + .addAllResources(cpuResources.asJava) .setData(MesosTaskLaunchData(task.serializedTask, task.attemptNumber).toByteString) .build() - (taskInfo, finalResources) + (taskInfo, finalResources.asJava) } override def statusUpdate(d: SchedulerDriver, status: TaskStatus) { diff --git a/core/src/main/scala/org/apache/spark/scheduler/cluster/mesos/MesosSchedulerUtils.scala b/core/src/main/scala/org/apache/spark/scheduler/cluster/mesos/MesosSchedulerUtils.scala index 5b854aa5c2754..860c8e097b3b9 100644 --- a/core/src/main/scala/org/apache/spark/scheduler/cluster/mesos/MesosSchedulerUtils.scala +++ b/core/src/main/scala/org/apache/spark/scheduler/cluster/mesos/MesosSchedulerUtils.scala @@ -20,7 +20,7 @@ package org.apache.spark.scheduler.cluster.mesos import java.util.{List => JList} import java.util.concurrent.CountDownLatch -import scala.collection.JavaConversions._ +import scala.collection.JavaConverters._ import scala.collection.mutable.ArrayBuffer import scala.util.control.NonFatal @@ -137,7 +137,7 @@ private[mesos] trait MesosSchedulerUtils extends Logging { protected def getResource(res: JList[Resource], name: String): Double = { // A resource can have multiple values in the offer since it can either be from // a specific role or wildcard. 
- res.filter(_.getName == name).map(_.getScalar.getValue).sum + res.asScala.filter(_.getName == name).map(_.getScalar.getValue).sum } protected def markRegistered(): Unit = { @@ -169,7 +169,7 @@ private[mesos] trait MesosSchedulerUtils extends Logging { amountToUse: Double): (List[Resource], List[Resource]) = { var remain = amountToUse var requestedResources = new ArrayBuffer[Resource] - val remainingResources = resources.map { + val remainingResources = resources.asScala.map { case r => { if (remain > 0 && r.getType == Value.Type.SCALAR && @@ -214,7 +214,7 @@ private[mesos] trait MesosSchedulerUtils extends Logging { * @return */ protected def toAttributeMap(offerAttributes: JList[Attribute]): Map[String, GeneratedMessage] = { - offerAttributes.map(attr => { + offerAttributes.asScala.map(attr => { val attrValue = attr.getType match { case Value.Type.SCALAR => attr.getScalar case Value.Type.RANGES => attr.getRanges @@ -253,7 +253,7 @@ private[mesos] trait MesosSchedulerUtils extends Logging { requiredValues.map(_.toLong).exists(offerRange.contains(_)) case Some(offeredValue: Value.Set) => // check if the specified required values is a subset of offered set - requiredValues.subsetOf(offeredValue.getItemList.toSet) + requiredValues.subsetOf(offeredValue.getItemList.asScala.toSet) case Some(textValue: Value.Text) => // check if the specified value is equal, if multiple values are specified // we succeed if any of them match. @@ -299,14 +299,13 @@ private[mesos] trait MesosSchedulerUtils extends Logging { Map() } else { try { - Map() ++ mapAsScalaMap(splitter.split(constraintsVal)).map { - case (k, v) => - if (v == null || v.isEmpty) { - (k, Set[String]()) - } else { - (k, v.split(',').toSet) - } - } + splitter.split(constraintsVal).asScala.toMap.mapValues(v => + if (v == null || v.isEmpty) { + Set[String]() + } else { + v.split(',').toSet + } + ) } catch { case NonFatal(e) => throw new IllegalArgumentException(s"Bad constraint string: $constraintsVal", e) diff --git a/core/src/main/scala/org/apache/spark/serializer/KryoSerializer.scala b/core/src/main/scala/org/apache/spark/serializer/KryoSerializer.scala index 0ff7562e912ca..048a938507277 100644 --- a/core/src/main/scala/org/apache/spark/serializer/KryoSerializer.scala +++ b/core/src/main/scala/org/apache/spark/serializer/KryoSerializer.scala @@ -21,6 +21,7 @@ import java.io.{EOFException, IOException, InputStream, OutputStream} import java.nio.ByteBuffer import javax.annotation.Nullable +import scala.collection.JavaConverters._ import scala.reflect.ClassTag import com.esotericsoftware.kryo.{Kryo, KryoException} @@ -373,16 +374,15 @@ private class JavaIterableWrapperSerializer override def read(kryo: Kryo, in: KryoInput, clz: Class[java.lang.Iterable[_]]) : java.lang.Iterable[_] = { kryo.readClassAndObject(in) match { - case scalaIterable: Iterable[_] => - scala.collection.JavaConversions.asJavaIterable(scalaIterable) - case javaIterable: java.lang.Iterable[_] => - javaIterable + case scalaIterable: Iterable[_] => scalaIterable.asJava + case javaIterable: java.lang.Iterable[_] => javaIterable } } } private object JavaIterableWrapperSerializer extends Logging { - // The class returned by asJavaIterable (scala.collection.convert.Wrappers$IterableWrapper). + // The class returned by JavaConverters.asJava + // (scala.collection.convert.Wrappers$IterableWrapper). 
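
The serializer comment above notes that asJava on a Scala Iterable yields scala.collection.convert.Wrappers$IterableWrapper, which is the class captured by the wrapperClass val right below. A tiny sketch of the conversion it describes (names are illustrative; the printed class name is what the comment refers to):

import scala.collection.JavaConverters._

object IterableWrapperSketch {
  def main(args: Array[String]): Unit = {
    // Typing the value as Iterable selects the Iterable -> java.lang.Iterable
    // decorator instead of the Seq -> java.util.List one.
    val source: Iterable[Int] = Seq(1, 2, 3)
    val wrapped: java.lang.Iterable[Int] = source.asJava
    // Expected to print scala.collection.convert.Wrappers$IterableWrapper.
    println(wrapped.getClass.getName)
  }
}
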
val wrapperClass = scala.collection.convert.WrapAsJava.asJavaIterable(Seq(1)).getClass diff --git a/core/src/main/scala/org/apache/spark/shuffle/FileShuffleBlockResolver.scala b/core/src/main/scala/org/apache/spark/shuffle/FileShuffleBlockResolver.scala index f6a96d81e7aa9..c057de9b3f4df 100644 --- a/core/src/main/scala/org/apache/spark/shuffle/FileShuffleBlockResolver.scala +++ b/core/src/main/scala/org/apache/spark/shuffle/FileShuffleBlockResolver.scala @@ -21,7 +21,7 @@ import java.io.File import java.util.concurrent.ConcurrentLinkedQueue import java.util.concurrent.atomic.AtomicInteger -import scala.collection.JavaConversions._ +import scala.collection.JavaConverters._ import org.apache.spark.{Logging, SparkConf, SparkEnv} import org.apache.spark.executor.ShuffleWriteMetrics @@ -210,11 +210,13 @@ private[spark] class FileShuffleBlockResolver(conf: SparkConf) shuffleStates.get(shuffleId) match { case Some(state) => if (consolidateShuffleFiles) { - for (fileGroup <- state.allFileGroups; file <- fileGroup.files) { + for (fileGroup <- state.allFileGroups.asScala; + file <- fileGroup.files) { file.delete() } } else { - for (mapId <- state.completedMapTasks; reduceId <- 0 until state.numBuckets) { + for (mapId <- state.completedMapTasks.asScala; + reduceId <- 0 until state.numBuckets) { val blockId = new ShuffleBlockId(shuffleId, mapId, reduceId) blockManager.diskBlockManager.getFile(blockId).delete() } diff --git a/core/src/main/scala/org/apache/spark/storage/BlockManagerMasterEndpoint.scala b/core/src/main/scala/org/apache/spark/storage/BlockManagerMasterEndpoint.scala index 6fec5240707a6..7db6035553ae6 100644 --- a/core/src/main/scala/org/apache/spark/storage/BlockManagerMasterEndpoint.scala +++ b/core/src/main/scala/org/apache/spark/storage/BlockManagerMasterEndpoint.scala @@ -21,7 +21,7 @@ import java.util.{HashMap => JHashMap} import scala.collection.immutable.HashSet import scala.collection.mutable -import scala.collection.JavaConversions._ +import scala.collection.JavaConverters._ import scala.concurrent.{ExecutionContext, Future} import org.apache.spark.rpc.{RpcEndpointRef, RpcEnv, RpcCallContext, ThreadSafeRpcEndpoint} @@ -133,7 +133,7 @@ class BlockManagerMasterEndpoint( // Find all blocks for the given RDD, remove the block from both blockLocations and // the blockManagerInfo that is tracking the blocks. 
- val blocks = blockLocations.keys.flatMap(_.asRDDId).filter(_.rddId == rddId) + val blocks = blockLocations.asScala.keys.flatMap(_.asRDDId).filter(_.rddId == rddId) blocks.foreach { blockId => val bms: mutable.HashSet[BlockManagerId] = blockLocations.get(blockId) bms.foreach(bm => blockManagerInfo.get(bm).foreach(_.removeBlock(blockId))) @@ -242,7 +242,7 @@ class BlockManagerMasterEndpoint( private def storageStatus: Array[StorageStatus] = { blockManagerInfo.map { case (blockManagerId, info) => - new StorageStatus(blockManagerId, info.maxMem, info.blocks) + new StorageStatus(blockManagerId, info.maxMem, info.blocks.asScala) }.toArray } @@ -292,7 +292,7 @@ class BlockManagerMasterEndpoint( if (askSlaves) { info.slaveEndpoint.ask[Seq[BlockId]](getMatchingBlockIds) } else { - Future { info.blocks.keys.filter(filter).toSeq } + Future { info.blocks.asScala.keys.filter(filter).toSeq } } future } diff --git a/core/src/main/scala/org/apache/spark/util/AkkaUtils.scala b/core/src/main/scala/org/apache/spark/util/AkkaUtils.scala index 78e7ddc27d1c7..1738258a0c794 100644 --- a/core/src/main/scala/org/apache/spark/util/AkkaUtils.scala +++ b/core/src/main/scala/org/apache/spark/util/AkkaUtils.scala @@ -17,7 +17,7 @@ package org.apache.spark.util -import scala.collection.JavaConversions.mapAsJavaMap +import scala.collection.JavaConverters._ import akka.actor.{ActorRef, ActorSystem, ExtendedActorSystem} import akka.pattern.ask @@ -92,7 +92,7 @@ private[spark] object AkkaUtils extends Logging { val akkaSslConfig = securityManager.akkaSSLOptions.createAkkaConfig .getOrElse(ConfigFactory.empty()) - val akkaConf = ConfigFactory.parseMap(conf.getAkkaConf.toMap[String, String]) + val akkaConf = ConfigFactory.parseMap(conf.getAkkaConf.toMap.asJava) .withFallback(akkaSslConfig).withFallback(ConfigFactory.parseString( s""" |akka.daemonic = on diff --git a/core/src/main/scala/org/apache/spark/util/ListenerBus.scala b/core/src/main/scala/org/apache/spark/util/ListenerBus.scala index a725767d08cc2..13cb516b583e9 100644 --- a/core/src/main/scala/org/apache/spark/util/ListenerBus.scala +++ b/core/src/main/scala/org/apache/spark/util/ListenerBus.scala @@ -19,12 +19,11 @@ package org.apache.spark.util import java.util.concurrent.CopyOnWriteArrayList -import scala.collection.JavaConversions._ +import scala.collection.JavaConverters._ import scala.reflect.ClassTag import scala.util.control.NonFatal import org.apache.spark.Logging -import org.apache.spark.scheduler.SparkListener /** * An event bus which posts events to its listeners. @@ -46,7 +45,7 @@ private[spark] trait ListenerBus[L <: AnyRef, E] extends Logging { * `postToAll` in the same thread for all events. */ final def postToAll(event: E): Unit = { - // JavaConversions will create a JIterableWrapper if we use some Scala collection functions. + // JavaConverters can create a JIterableWrapper if we use asScala. // However, this method will be called frequently. To avoid the wrapper cost, here ewe use // Java Iterator directly. 
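
The ListenerBus comment just above spells out the trade-off this patch keeps in mind: asScala is convenient, but it allocates a wrapper, so the hot postToAll path sticks with the raw Java iterator while infrequent paths such as findListenersByClass use asScala. A small sketch of both styles against a CopyOnWriteArrayList (the listener values are illustrative):

import java.util.concurrent.CopyOnWriteArrayList

import scala.collection.JavaConverters._

object ListenerIterationSketch {
  def main(args: Array[String]): Unit = {
    val listeners = new CopyOnWriteArrayList[String]()
    listeners.add("jobListener")
    listeners.add("stageListener")

    // Hot path: iterate with the Java iterator directly, avoiding the
    // per-call wrapper that asScala would allocate.
    val iter = listeners.iterator()
    while (iter.hasNext) {
      println(iter.next())
    }

    // Infrequent path: the asScala view is fine here and reads naturally.
    println(listeners.asScala.filter(_.startsWith("job")).mkString(", "))
  }
}
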
val iter = listeners.iterator @@ -69,7 +68,7 @@ private[spark] trait ListenerBus[L <: AnyRef, E] extends Logging { private[spark] def findListenersByClass[T <: L : ClassTag](): Seq[T] = { val c = implicitly[ClassTag[T]].runtimeClass - listeners.filter(_.getClass == c).map(_.asInstanceOf[T]).toSeq + listeners.asScala.filter(_.getClass == c).map(_.asInstanceOf[T]).toSeq } } diff --git a/core/src/main/scala/org/apache/spark/util/MutableURLClassLoader.scala b/core/src/main/scala/org/apache/spark/util/MutableURLClassLoader.scala index 169489df6c1ea..a1c33212cdb2b 100644 --- a/core/src/main/scala/org/apache/spark/util/MutableURLClassLoader.scala +++ b/core/src/main/scala/org/apache/spark/util/MutableURLClassLoader.scala @@ -21,8 +21,6 @@ import java.net.{URLClassLoader, URL} import java.util.Enumeration import java.util.concurrent.ConcurrentHashMap -import scala.collection.JavaConversions._ - /** * URL class loader that exposes the `addURL` and `getURLs` methods in URLClassLoader. */ diff --git a/core/src/main/scala/org/apache/spark/util/TimeStampedHashMap.scala b/core/src/main/scala/org/apache/spark/util/TimeStampedHashMap.scala index 8de75ba9a9c92..d7e5143c30953 100644 --- a/core/src/main/scala/org/apache/spark/util/TimeStampedHashMap.scala +++ b/core/src/main/scala/org/apache/spark/util/TimeStampedHashMap.scala @@ -21,7 +21,8 @@ import java.util.Set import java.util.Map.Entry import java.util.concurrent.ConcurrentHashMap -import scala.collection.{JavaConversions, mutable} +import scala.collection.JavaConverters._ +import scala.collection.mutable import org.apache.spark.Logging @@ -50,8 +51,7 @@ private[spark] class TimeStampedHashMap[A, B](updateTimeStampOnGet: Boolean = fa } def iterator: Iterator[(A, B)] = { - val jIterator = getEntrySet.iterator - JavaConversions.asScalaIterator(jIterator).map(kv => (kv.getKey, kv.getValue.value)) + getEntrySet.iterator.asScala.map(kv => (kv.getKey, kv.getValue.value)) } def getEntrySet: Set[Entry[A, TimeStampedValue[B]]] = internalMap.entrySet @@ -90,9 +90,7 @@ private[spark] class TimeStampedHashMap[A, B](updateTimeStampOnGet: Boolean = fa } override def filter(p: ((A, B)) => Boolean): mutable.Map[A, B] = { - JavaConversions.mapAsScalaConcurrentMap(internalMap) - .map { case (k, TimeStampedValue(v, t)) => (k, v) } - .filter(p) + internalMap.asScala.map { case (k, TimeStampedValue(v, t)) => (k, v) }.filter(p) } override def empty: mutable.Map[A, B] = new TimeStampedHashMap[A, B]() diff --git a/core/src/main/scala/org/apache/spark/util/TimeStampedHashSet.scala b/core/src/main/scala/org/apache/spark/util/TimeStampedHashSet.scala index 7cd8f28b12dd6..65efeb1f4c19c 100644 --- a/core/src/main/scala/org/apache/spark/util/TimeStampedHashSet.scala +++ b/core/src/main/scala/org/apache/spark/util/TimeStampedHashSet.scala @@ -19,7 +19,7 @@ package org.apache.spark.util import java.util.concurrent.ConcurrentHashMap -import scala.collection.JavaConversions +import scala.collection.JavaConverters._ import scala.collection.mutable.Set private[spark] class TimeStampedHashSet[A] extends Set[A] { @@ -31,7 +31,7 @@ private[spark] class TimeStampedHashSet[A] extends Set[A] { def iterator: Iterator[A] = { val jIterator = internalMap.entrySet().iterator() - JavaConversions.asScalaIterator(jIterator).map(_.getKey) + jIterator.asScala.map(_.getKey) } override def + (elem: A): Set[A] = { diff --git a/core/src/main/scala/org/apache/spark/util/Utils.scala b/core/src/main/scala/org/apache/spark/util/Utils.scala index 8313312226713..2bab4af2e73ab 100644 --- 
a/core/src/main/scala/org/apache/spark/util/Utils.scala +++ b/core/src/main/scala/org/apache/spark/util/Utils.scala @@ -25,7 +25,7 @@ import java.util.{Properties, Locale, Random, UUID} import java.util.concurrent._ import javax.net.ssl.HttpsURLConnection -import scala.collection.JavaConversions._ +import scala.collection.JavaConverters._ import scala.collection.Map import scala.collection.mutable.ArrayBuffer import scala.io.Source @@ -748,12 +748,12 @@ private[spark] object Utils extends Logging { // getNetworkInterfaces returns ifs in reverse order compared to ifconfig output order // on unix-like system. On windows, it returns in index order. // It's more proper to pick ip address following system output order. - val activeNetworkIFs = NetworkInterface.getNetworkInterfaces.toList + val activeNetworkIFs = NetworkInterface.getNetworkInterfaces.asScala.toSeq val reOrderedNetworkIFs = if (isWindows) activeNetworkIFs else activeNetworkIFs.reverse for (ni <- reOrderedNetworkIFs) { - val addresses = ni.getInetAddresses.toList - .filterNot(addr => addr.isLinkLocalAddress || addr.isLoopbackAddress) + val addresses = ni.getInetAddresses.asScala + .filterNot(addr => addr.isLinkLocalAddress || addr.isLoopbackAddress).toSeq if (addresses.nonEmpty) { val addr = addresses.find(_.isInstanceOf[Inet4Address]).getOrElse(addresses.head) // because of Inet6Address.toHostName may add interface at the end if it knows about it @@ -1498,10 +1498,8 @@ private[spark] object Utils extends Logging { * properties which have been set explicitly, as well as those for which only a default value * has been defined. */ def getSystemProperties: Map[String, String] = { - val sysProps = for (key <- System.getProperties.stringPropertyNames()) yield - (key, System.getProperty(key)) - - sysProps.toMap + System.getProperties.stringPropertyNames().asScala + .map(key => (key, System.getProperty(key))).toMap } /** @@ -1812,7 +1810,8 @@ private[spark] object Utils extends Logging { try { val properties = new Properties() properties.load(inReader) - properties.stringPropertyNames().map(k => (k, properties(k).trim)).toMap + properties.stringPropertyNames().asScala.map( + k => (k, properties.getProperty(k).trim)).toMap } catch { case e: IOException => throw new SparkException(s"Failed when loading Spark properties from $filename", e) @@ -1941,7 +1940,8 @@ private[spark] object Utils extends Logging { return true } isBindCollision(e.getCause) - case e: MultiException => e.getThrowables.exists(isBindCollision) + case e: MultiException => + e.getThrowables.asScala.exists(isBindCollision) case e: Exception => isBindCollision(e.getCause) case _ => false } diff --git a/core/src/main/scala/org/apache/spark/util/collection/Utils.scala b/core/src/main/scala/org/apache/spark/util/collection/Utils.scala index bdbca00a00622..4939b600dbfbd 100644 --- a/core/src/main/scala/org/apache/spark/util/collection/Utils.scala +++ b/core/src/main/scala/org/apache/spark/util/collection/Utils.scala @@ -17,7 +17,7 @@ package org.apache.spark.util.collection -import scala.collection.JavaConversions.{collectionAsScalaIterable, asJavaIterator} +import scala.collection.JavaConverters._ import com.google.common.collect.{Ordering => GuavaOrdering} @@ -34,6 +34,6 @@ private[spark] object Utils { val ordering = new GuavaOrdering[T] { override def compare(l: T, r: T): Int = ord.compare(l, r) } - collectionAsScalaIterable(ordering.leastOf(asJavaIterator(input), num)).iterator + ordering.leastOf(input.asJava, num).iterator.asScala } } diff --git 
a/core/src/test/java/org/apache/spark/JavaAPISuite.java b/core/src/test/java/org/apache/spark/JavaAPISuite.java index ffe4b4baffb2a..ebd3d61ae7324 100644 --- a/core/src/test/java/org/apache/spark/JavaAPISuite.java +++ b/core/src/test/java/org/apache/spark/JavaAPISuite.java @@ -24,10 +24,10 @@ import java.util.*; import java.util.concurrent.*; -import scala.collection.JavaConversions; import scala.Tuple2; import scala.Tuple3; import scala.Tuple4; +import scala.collection.JavaConverters; import com.google.common.collect.ImmutableMap; import com.google.common.collect.Iterables; @@ -1473,7 +1473,9 @@ public Integer call(Integer v1, Integer v2) throws Exception { Assert.assertEquals(expected, results); Partitioner defaultPartitioner = Partitioner.defaultPartitioner( - combinedRDD.rdd(), JavaConversions.asScalaBuffer(Lists.>newArrayList())); + combinedRDD.rdd(), + JavaConverters.collectionAsScalaIterableConverter( + Collections.>emptyList()).asScala().toSeq()); combinedRDD = originalRDD.keyBy(keyFunction) .combineByKey( createCombinerFunction, diff --git a/core/src/test/scala/org/apache/spark/SparkConfSuite.scala b/core/src/test/scala/org/apache/spark/SparkConfSuite.scala index 90cb7da94e88a..ff9a92cc0a421 100644 --- a/core/src/test/scala/org/apache/spark/SparkConfSuite.scala +++ b/core/src/test/scala/org/apache/spark/SparkConfSuite.scala @@ -19,6 +19,7 @@ package org.apache.spark import java.util.concurrent.{TimeUnit, Executors} +import scala.collection.JavaConverters._ import scala.concurrent.duration._ import scala.language.postfixOps import scala.util.{Try, Random} @@ -148,7 +149,6 @@ class SparkConfSuite extends SparkFunSuite with LocalSparkContext with ResetSyst } test("Thread safeness - SPARK-5425") { - import scala.collection.JavaConversions._ val executor = Executors.newSingleThreadScheduledExecutor() val sf = executor.scheduleAtFixedRate(new Runnable { override def run(): Unit = @@ -163,8 +163,9 @@ class SparkConfSuite extends SparkFunSuite with LocalSparkContext with ResetSyst } } finally { executor.shutdownNow() - for (key <- System.getProperties.stringPropertyNames() if key.startsWith("spark.5425.")) - System.getProperties.remove(key) + val sysProps = System.getProperties + for (key <- sysProps.stringPropertyNames().asScala if key.startsWith("spark.5425.")) + sysProps.remove(key) } } diff --git a/core/src/test/scala/org/apache/spark/deploy/LogUrlsStandaloneSuite.scala b/core/src/test/scala/org/apache/spark/deploy/LogUrlsStandaloneSuite.scala index cbd2aee10c0e2..86eb41dd7e5d7 100644 --- a/core/src/test/scala/org/apache/spark/deploy/LogUrlsStandaloneSuite.scala +++ b/core/src/test/scala/org/apache/spark/deploy/LogUrlsStandaloneSuite.scala @@ -19,7 +19,6 @@ package org.apache.spark.deploy import java.net.URL -import scala.collection.JavaConversions._ import scala.collection.mutable import scala.io.Source diff --git a/core/src/test/scala/org/apache/spark/deploy/RPackageUtilsSuite.scala b/core/src/test/scala/org/apache/spark/deploy/RPackageUtilsSuite.scala index 47a64081e297e..1ed4bae3ca21e 100644 --- a/core/src/test/scala/org/apache/spark/deploy/RPackageUtilsSuite.scala +++ b/core/src/test/scala/org/apache/spark/deploy/RPackageUtilsSuite.scala @@ -21,14 +21,14 @@ import java.io.{PrintStream, OutputStream, File} import java.net.URI import java.util.jar.Attributes.Name import java.util.jar.{JarFile, Manifest} -import java.util.zip.{ZipEntry, ZipFile} +import java.util.zip.ZipFile -import org.scalatest.BeforeAndAfterEach -import scala.collection.JavaConversions._ +import 
scala.collection.JavaConverters._ import scala.collection.mutable.ArrayBuffer import com.google.common.io.Files import org.apache.commons.io.FileUtils +import org.scalatest.BeforeAndAfterEach import org.apache.spark.SparkFunSuite import org.apache.spark.api.r.RUtils @@ -142,7 +142,7 @@ class RPackageUtilsSuite extends SparkFunSuite with BeforeAndAfterEach { IvyTestUtils.writeFile(fakePackageDir, "DESCRIPTION", "abc") val finalZip = RPackageUtils.zipRLibraries(tempDir, "sparkr.zip") assert(finalZip.exists()) - val entries = new ZipFile(finalZip).entries().toSeq.map(_.getName) + val entries = new ZipFile(finalZip).entries().asScala.map(_.getName).toSeq assert(entries.contains("/test.R")) assert(entries.contains("/SparkR/abc.R")) assert(entries.contains("/SparkR/DESCRIPTION")) diff --git a/core/src/test/scala/org/apache/spark/deploy/worker/ExecutorRunnerTest.scala b/core/src/test/scala/org/apache/spark/deploy/worker/ExecutorRunnerTest.scala index bed6f3ea61241..98664dc1101e6 100644 --- a/core/src/test/scala/org/apache/spark/deploy/worker/ExecutorRunnerTest.scala +++ b/core/src/test/scala/org/apache/spark/deploy/worker/ExecutorRunnerTest.scala @@ -19,8 +19,6 @@ package org.apache.spark.deploy.worker import java.io.File -import scala.collection.JavaConversions._ - import org.apache.spark.deploy.{ApplicationDescription, Command, ExecutorState} import org.apache.spark.{SecurityManager, SparkConf, SparkFunSuite} @@ -36,6 +34,7 @@ class ExecutorRunnerTest extends SparkFunSuite { ExecutorState.RUNNING) val builder = CommandUtils.buildProcessBuilder( appDesc.command, new SecurityManager(conf), 512, sparkHome, er.substituteVariables) - assert(builder.command().last === appId) + val builderCommand = builder.command() + assert(builderCommand.get(builderCommand.size() - 1) === appId) } } diff --git a/core/src/test/scala/org/apache/spark/scheduler/SparkListenerSuite.scala b/core/src/test/scala/org/apache/spark/scheduler/SparkListenerSuite.scala index 730535ece7878..a9652d7e7d0b0 100644 --- a/core/src/test/scala/org/apache/spark/scheduler/SparkListenerSuite.scala +++ b/core/src/test/scala/org/apache/spark/scheduler/SparkListenerSuite.scala @@ -20,7 +20,7 @@ package org.apache.spark.scheduler import java.util.concurrent.Semaphore import scala.collection.mutable -import scala.collection.JavaConversions._ +import scala.collection.JavaConverters._ import org.scalatest.Matchers @@ -365,10 +365,9 @@ class SparkListenerSuite extends SparkFunSuite with LocalSparkContext with Match .set("spark.extraListeners", classOf[ListenerThatAcceptsSparkConf].getName + "," + classOf[BasicJobCounter].getName) sc = new SparkContext(conf) - sc.listenerBus.listeners.collect { case x: BasicJobCounter => x}.size should be (1) - sc.listenerBus.listeners.collect { - case x: ListenerThatAcceptsSparkConf => x - }.size should be (1) + sc.listenerBus.listeners.asScala.count(_.isInstanceOf[BasicJobCounter]) should be (1) + sc.listenerBus.listeners.asScala + .count(_.isInstanceOf[ListenerThatAcceptsSparkConf]) should be (1) } /** diff --git a/core/src/test/scala/org/apache/spark/scheduler/cluster/mesos/MesosSchedulerBackendSuite.scala b/core/src/test/scala/org/apache/spark/scheduler/cluster/mesos/MesosSchedulerBackendSuite.scala index 5ed30f64d705f..319b3173e7a6e 100644 --- a/core/src/test/scala/org/apache/spark/scheduler/cluster/mesos/MesosSchedulerBackendSuite.scala +++ b/core/src/test/scala/org/apache/spark/scheduler/cluster/mesos/MesosSchedulerBackendSuite.scala @@ -18,10 +18,11 @@ package org.apache.spark.scheduler.cluster.mesos import 
java.nio.ByteBuffer -import java.util +import java.util.Arrays +import java.util.Collection import java.util.Collections -import scala.collection.JavaConversions._ +import scala.collection.JavaConverters._ import scala.collection.mutable import scala.collection.mutable.ArrayBuffer @@ -61,7 +62,7 @@ class MesosSchedulerBackendSuite extends SparkFunSuite with LocalSparkContext wi val mesosSchedulerBackend = new MesosSchedulerBackend(taskScheduler, sc, "master") - val resources = List( + val resources = Arrays.asList( mesosSchedulerBackend.createResource("cpus", 4), mesosSchedulerBackend.createResource("mem", 1024)) // uri is null. @@ -98,7 +99,7 @@ class MesosSchedulerBackendSuite extends SparkFunSuite with LocalSparkContext wi val backend = new MesosSchedulerBackend(taskScheduler, sc, "master") val (execInfo, _) = backend.createExecutorInfo( - List(backend.createResource("cpus", 4)), "mockExecutor") + Arrays.asList(backend.createResource("cpus", 4)), "mockExecutor") assert(execInfo.getContainer.getDocker.getImage.equals("spark/mock")) val portmaps = execInfo.getContainer.getDocker.getPortMappingsList assert(portmaps.get(0).getHostPort.equals(80)) @@ -179,7 +180,7 @@ class MesosSchedulerBackendSuite extends SparkFunSuite with LocalSparkContext wi when(taskScheduler.resourceOffers(expectedWorkerOffers)).thenReturn(Seq(Seq(taskDesc))) when(taskScheduler.CPUS_PER_TASK).thenReturn(2) - val capture = ArgumentCaptor.forClass(classOf[util.Collection[TaskInfo]]) + val capture = ArgumentCaptor.forClass(classOf[Collection[TaskInfo]]) when( driver.launchTasks( Matchers.eq(Collections.singleton(mesosOffers.get(0).getId)), @@ -279,7 +280,7 @@ class MesosSchedulerBackendSuite extends SparkFunSuite with LocalSparkContext wi when(taskScheduler.resourceOffers(expectedWorkerOffers)).thenReturn(Seq(Seq(taskDesc))) when(taskScheduler.CPUS_PER_TASK).thenReturn(1) - val capture = ArgumentCaptor.forClass(classOf[util.Collection[TaskInfo]]) + val capture = ArgumentCaptor.forClass(classOf[Collection[TaskInfo]]) when( driver.launchTasks( Matchers.eq(Collections.singleton(mesosOffers.get(0).getId)), @@ -304,7 +305,7 @@ class MesosSchedulerBackendSuite extends SparkFunSuite with LocalSparkContext wi assert(cpusDev.getName.equals("cpus")) assert(cpusDev.getScalar.getValue.equals(1.0)) assert(cpusDev.getRole.equals("dev")) - val executorResources = taskInfo.getExecutor.getResourcesList + val executorResources = taskInfo.getExecutor.getResourcesList.asScala assert(executorResources.exists { r => r.getName.equals("mem") && r.getScalar.getValue.equals(484.0) && r.getRole.equals("prod") }) diff --git a/core/src/test/scala/org/apache/spark/serializer/KryoSerializerSuite.scala b/core/src/test/scala/org/apache/spark/serializer/KryoSerializerSuite.scala index 23a1fdb0f5009..8d1c9d17e977e 100644 --- a/core/src/test/scala/org/apache/spark/serializer/KryoSerializerSuite.scala +++ b/core/src/test/scala/org/apache/spark/serializer/KryoSerializerSuite.scala @@ -19,6 +19,7 @@ package org.apache.spark.serializer import java.io.{ByteArrayInputStream, ByteArrayOutputStream} +import scala.collection.JavaConverters._ import scala.collection.mutable import scala.reflect.ClassTag @@ -173,7 +174,7 @@ class KryoSerializerSuite extends SparkFunSuite with SharedSparkContext { test("asJavaIterable") { // Serialize a collection wrapped by asJavaIterable val ser = new KryoSerializer(conf).newInstance() - val a = ser.serialize(scala.collection.convert.WrapAsJava.asJavaIterable(Seq(12345))) + val a = ser.serialize(Seq(12345).asJava) val b = 
ser.deserialize[java.lang.Iterable[Int]](a) assert(b.iterator().next() === 12345) diff --git a/core/src/test/scala/org/apache/spark/ui/UISeleniumSuite.scala b/core/src/test/scala/org/apache/spark/ui/UISeleniumSuite.scala index 69888b2694bae..22e30ecaf0533 100644 --- a/core/src/test/scala/org/apache/spark/ui/UISeleniumSuite.scala +++ b/core/src/test/scala/org/apache/spark/ui/UISeleniumSuite.scala @@ -21,7 +21,6 @@ import java.net.{HttpURLConnection, URL} import javax.servlet.http.{HttpServletResponse, HttpServletRequest} import scala.io.Source -import scala.collection.JavaConversions._ import scala.xml.Node import com.gargoylesoftware.htmlunit.DefaultCssErrorHandler @@ -341,15 +340,15 @@ class UISeleniumSuite extends SparkFunSuite with WebBrowser with Matchers with B // The completed jobs table should have two rows. The first row will be the most recent job: val firstRow = find(cssSelector("tbody tr")).get.underlying val firstRowColumns = firstRow.findElements(By.tagName("td")) - firstRowColumns(0).getText should be ("1") - firstRowColumns(4).getText should be ("1/1 (2 skipped)") - firstRowColumns(5).getText should be ("8/8 (16 skipped)") + firstRowColumns.get(0).getText should be ("1") + firstRowColumns.get(4).getText should be ("1/1 (2 skipped)") + firstRowColumns.get(5).getText should be ("8/8 (16 skipped)") // The second row is the first run of the job, where nothing was skipped: val secondRow = findAll(cssSelector("tbody tr")).toSeq(1).underlying val secondRowColumns = secondRow.findElements(By.tagName("td")) - secondRowColumns(0).getText should be ("0") - secondRowColumns(4).getText should be ("3/3") - secondRowColumns(5).getText should be ("24/24") + secondRowColumns.get(0).getText should be ("0") + secondRowColumns.get(4).getText should be ("3/3") + secondRowColumns.get(5).getText should be ("24/24") } } } @@ -502,8 +501,8 @@ class UISeleniumSuite extends SparkFunSuite with WebBrowser with Matchers with B for { (row, idx) <- rows.zipWithIndex columns = row.findElements(By.tagName("td")) - id = columns(0).getText() - description = columns(1).getText() + id = columns.get(0).getText() + description = columns.get(1).getText() } { id should be (expJobInfo(idx)._1) description should include (expJobInfo(idx)._2) @@ -547,8 +546,8 @@ class UISeleniumSuite extends SparkFunSuite with WebBrowser with Matchers with B for { (row, idx) <- rows.zipWithIndex columns = row.findElements(By.tagName("td")) - id = columns(0).getText() - description = columns(1).getText() + id = columns.get(0).getText() + description = columns.get(1).getText() } { id should be (expStageInfo(idx)._1) description should include (expStageInfo(idx)._2) diff --git a/examples/src/main/scala/org/apache/spark/examples/CassandraCQLTest.scala b/examples/src/main/scala/org/apache/spark/examples/CassandraCQLTest.scala index 36832f51d2ad4..fa07c1e5017cd 100644 --- a/examples/src/main/scala/org/apache/spark/examples/CassandraCQLTest.scala +++ b/examples/src/main/scala/org/apache/spark/examples/CassandraCQLTest.scala @@ -19,10 +19,7 @@ package org.apache.spark.examples import java.nio.ByteBuffer - -import scala.collection.JavaConversions._ -import scala.collection.mutable.ListBuffer -import scala.collection.immutable.Map +import java.util.Collections import org.apache.cassandra.hadoop.ConfigHelper import org.apache.cassandra.hadoop.cql3.CqlPagingInputFormat @@ -32,7 +29,6 @@ import org.apache.cassandra.utils.ByteBufferUtil import org.apache.hadoop.mapreduce.Job import org.apache.spark.{SparkConf, SparkContext} -import 
org.apache.spark.SparkContext._ /* @@ -121,12 +117,9 @@ object CassandraCQLTest { val casoutputCF = aggregatedRDD.map { case (productId, saleCount) => { - val outColFamKey = Map("prod_id" -> ByteBufferUtil.bytes(productId)) - val outKey: java.util.Map[String, ByteBuffer] = outColFamKey - var outColFamVal = new ListBuffer[ByteBuffer] - outColFamVal += ByteBufferUtil.bytes(saleCount) - val outVal: java.util.List[ByteBuffer] = outColFamVal - (outKey, outVal) + val outKey = Collections.singletonMap("prod_id", ByteBufferUtil.bytes(productId)) + val outVal = Collections.singletonList(ByteBufferUtil.bytes(saleCount)) + (outKey, outVal) } } diff --git a/examples/src/main/scala/org/apache/spark/examples/CassandraTest.scala b/examples/src/main/scala/org/apache/spark/examples/CassandraTest.scala index 96ef3e198e380..2e56d24c60c33 100644 --- a/examples/src/main/scala/org/apache/spark/examples/CassandraTest.scala +++ b/examples/src/main/scala/org/apache/spark/examples/CassandraTest.scala @@ -19,10 +19,9 @@ package org.apache.spark.examples import java.nio.ByteBuffer +import java.util.Arrays import java.util.SortedMap -import scala.collection.JavaConversions._ - import org.apache.cassandra.db.IColumn import org.apache.cassandra.hadoop.ColumnFamilyOutputFormat import org.apache.cassandra.hadoop.ConfigHelper @@ -32,7 +31,6 @@ import org.apache.cassandra.utils.ByteBufferUtil import org.apache.hadoop.mapreduce.Job import org.apache.spark.{SparkConf, SparkContext} -import org.apache.spark.SparkContext._ /* * This example demonstrates using Spark with Cassandra with the New Hadoop API and Cassandra @@ -118,7 +116,7 @@ object CassandraTest { val outputkey = ByteBufferUtil.bytes(word + "-COUNT-" + System.currentTimeMillis) - val mutations: java.util.List[Mutation] = new Mutation() :: new Mutation() :: Nil + val mutations = Arrays.asList(new Mutation(), new Mutation()) mutations.get(0).setColumn_or_supercolumn(new ColumnOrSuperColumn()) mutations.get(0).column_or_supercolumn.setColumn(colWord) mutations.get(1).setColumn_or_supercolumn(new ColumnOrSuperColumn()) diff --git a/examples/src/main/scala/org/apache/spark/examples/DriverSubmissionTest.scala b/examples/src/main/scala/org/apache/spark/examples/DriverSubmissionTest.scala index c42df2b8845d2..bec61f3cd4296 100644 --- a/examples/src/main/scala/org/apache/spark/examples/DriverSubmissionTest.scala +++ b/examples/src/main/scala/org/apache/spark/examples/DriverSubmissionTest.scala @@ -18,7 +18,7 @@ // scalastyle:off println package org.apache.spark.examples -import scala.collection.JavaConversions._ +import scala.collection.JavaConverters._ import org.apache.spark.util.Utils @@ -36,10 +36,10 @@ object DriverSubmissionTest { val properties = Utils.getSystemProperties println("Environment variables containing SPARK_TEST:") - env.filter{case (k, v) => k.contains("SPARK_TEST")}.foreach(println) + env.asScala.filter { case (k, _) => k.contains("SPARK_TEST")}.foreach(println) println("System properties containing spark.test:") - properties.filter{case (k, v) => k.toString.contains("spark.test")}.foreach(println) + properties.filter { case (k, _) => k.toString.contains("spark.test") }.foreach(println) for (i <- 1 until numSecondsToSleep) { println(s"Alive for $i out of $numSecondsToSleep seconds") diff --git a/examples/src/main/scala/org/apache/spark/examples/pythonconverters/AvroConverters.scala b/examples/src/main/scala/org/apache/spark/examples/pythonconverters/AvroConverters.scala index 3ebb112fc069e..805184e740f06 100644 --- 
a/examples/src/main/scala/org/apache/spark/examples/pythonconverters/AvroConverters.scala +++ b/examples/src/main/scala/org/apache/spark/examples/pythonconverters/AvroConverters.scala @@ -19,7 +19,7 @@ package org.apache.spark.examples.pythonconverters import java.util.{Collection => JCollection, Map => JMap} -import scala.collection.JavaConversions._ +import scala.collection.JavaConverters._ import org.apache.avro.generic.{GenericFixed, IndexedRecord} import org.apache.avro.mapred.AvroWrapper @@ -58,7 +58,7 @@ object AvroConversionUtil extends Serializable { val map = new java.util.HashMap[String, Any] obj match { case record: IndexedRecord => - record.getSchema.getFields.zipWithIndex.foreach { case (f, i) => + record.getSchema.getFields.asScala.zipWithIndex.foreach { case (f, i) => map.put(f.name, fromAvro(record.get(i), f.schema)) } case other => throw new SparkException( @@ -68,9 +68,9 @@ object AvroConversionUtil extends Serializable { } def unpackMap(obj: Any, schema: Schema): JMap[String, Any] = { - obj.asInstanceOf[JMap[_, _]].map { case (key, value) => + obj.asInstanceOf[JMap[_, _]].asScala.map { case (key, value) => (key.toString, fromAvro(value, schema.getValueType)) - } + }.asJava } def unpackFixed(obj: Any, schema: Schema): Array[Byte] = { @@ -91,17 +91,17 @@ object AvroConversionUtil extends Serializable { def unpackArray(obj: Any, schema: Schema): JCollection[Any] = obj match { case c: JCollection[_] => - c.map(fromAvro(_, schema.getElementType)) + c.asScala.map(fromAvro(_, schema.getElementType)).toSeq.asJava case arr: Array[_] if arr.getClass.getComponentType.isPrimitive => - arr.toSeq + arr.toSeq.asJava.asInstanceOf[JCollection[Any]] case arr: Array[_] => - arr.map(fromAvro(_, schema.getElementType)).toSeq + arr.map(fromAvro(_, schema.getElementType)).toSeq.asJava case other => throw new SparkException( s"Unknown ARRAY type ${other.getClass.getName}") } def unpackUnion(obj: Any, schema: Schema): Any = { - schema.getTypes.toList match { + schema.getTypes.asScala.toList match { case List(s) => fromAvro(obj, s) case List(n, s) if n.getType == NULL => fromAvro(obj, s) case List(s, n) if n.getType == NULL => fromAvro(obj, s) diff --git a/examples/src/main/scala/org/apache/spark/examples/pythonconverters/CassandraConverters.scala b/examples/src/main/scala/org/apache/spark/examples/pythonconverters/CassandraConverters.scala index 83feb5703b908..00ce47af4813d 100644 --- a/examples/src/main/scala/org/apache/spark/examples/pythonconverters/CassandraConverters.scala +++ b/examples/src/main/scala/org/apache/spark/examples/pythonconverters/CassandraConverters.scala @@ -17,11 +17,13 @@ package org.apache.spark.examples.pythonconverters -import org.apache.spark.api.python.Converter import java.nio.ByteBuffer + +import scala.collection.JavaConverters._ + import org.apache.cassandra.utils.ByteBufferUtil -import collection.JavaConversions._ +import org.apache.spark.api.python.Converter /** * Implementation of [[org.apache.spark.api.python.Converter]] that converts Cassandra @@ -30,7 +32,7 @@ import collection.JavaConversions._ class CassandraCQLKeyConverter extends Converter[Any, java.util.Map[String, Int]] { override def convert(obj: Any): java.util.Map[String, Int] = { val result = obj.asInstanceOf[java.util.Map[String, ByteBuffer]] - mapAsJavaMap(result.mapValues(bb => ByteBufferUtil.toInt(bb))) + result.asScala.mapValues(ByteBufferUtil.toInt).asJava } } @@ -41,7 +43,7 @@ class CassandraCQLKeyConverter extends Converter[Any, java.util.Map[String, Int] class CassandraCQLValueConverter 
extends Converter[Any, java.util.Map[String, String]] { override def convert(obj: Any): java.util.Map[String, String] = { val result = obj.asInstanceOf[java.util.Map[String, ByteBuffer]] - mapAsJavaMap(result.mapValues(bb => ByteBufferUtil.string(bb))) + result.asScala.mapValues(ByteBufferUtil.string).asJava } } @@ -52,7 +54,7 @@ class CassandraCQLValueConverter extends Converter[Any, java.util.Map[String, St class ToCassandraCQLKeyConverter extends Converter[Any, java.util.Map[String, ByteBuffer]] { override def convert(obj: Any): java.util.Map[String, ByteBuffer] = { val input = obj.asInstanceOf[java.util.Map[String, Int]] - mapAsJavaMap(input.mapValues(i => ByteBufferUtil.bytes(i))) + input.asScala.mapValues(ByteBufferUtil.bytes).asJava } } @@ -63,6 +65,6 @@ class ToCassandraCQLKeyConverter extends Converter[Any, java.util.Map[String, By class ToCassandraCQLValueConverter extends Converter[Any, java.util.List[ByteBuffer]] { override def convert(obj: Any): java.util.List[ByteBuffer] = { val input = obj.asInstanceOf[java.util.List[String]] - seqAsJavaList(input.map(s => ByteBufferUtil.bytes(s))) + input.asScala.map(ByteBufferUtil.bytes).asJava } } diff --git a/examples/src/main/scala/org/apache/spark/examples/pythonconverters/HBaseConverters.scala b/examples/src/main/scala/org/apache/spark/examples/pythonconverters/HBaseConverters.scala index 90d48a64106c7..0a25ee7ae56f4 100644 --- a/examples/src/main/scala/org/apache/spark/examples/pythonconverters/HBaseConverters.scala +++ b/examples/src/main/scala/org/apache/spark/examples/pythonconverters/HBaseConverters.scala @@ -17,7 +17,7 @@ package org.apache.spark.examples.pythonconverters -import scala.collection.JavaConversions._ +import scala.collection.JavaConverters._ import scala.util.parsing.json.JSONObject import org.apache.spark.api.python.Converter @@ -33,7 +33,6 @@ import org.apache.hadoop.hbase.CellUtil */ class HBaseResultToStringConverter extends Converter[Any, String] { override def convert(obj: Any): String = { - import collection.JavaConverters._ val result = obj.asInstanceOf[Result] val output = result.listCells.asScala.map(cell => Map( @@ -77,7 +76,7 @@ class StringToImmutableBytesWritableConverter extends Converter[Any, ImmutableBy */ class StringListToPutConverter extends Converter[Any, Put] { override def convert(obj: Any): Put = { - val output = obj.asInstanceOf[java.util.ArrayList[String]].map(Bytes.toBytes(_)).toArray + val output = obj.asInstanceOf[java.util.ArrayList[String]].asScala.map(Bytes.toBytes).toArray val put = new Put(output(0)) put.add(output(1), output(2), output(3)) } diff --git a/external/flume-sink/src/test/scala/org/apache/spark/streaming/flume/sink/SparkSinkSuite.scala b/external/flume-sink/src/test/scala/org/apache/spark/streaming/flume/sink/SparkSinkSuite.scala index fa43629d49771..d2654700ea729 100644 --- a/external/flume-sink/src/test/scala/org/apache/spark/streaming/flume/sink/SparkSinkSuite.scala +++ b/external/flume-sink/src/test/scala/org/apache/spark/streaming/flume/sink/SparkSinkSuite.scala @@ -20,7 +20,7 @@ import java.net.InetSocketAddress import java.util.concurrent.atomic.AtomicInteger import java.util.concurrent.{TimeUnit, CountDownLatch, Executors} -import scala.collection.JavaConversions._ +import scala.collection.JavaConverters._ import scala.concurrent.{ExecutionContext, Future} import scala.util.{Failure, Success} @@ -166,7 +166,7 @@ class SparkSinkSuite extends FunSuite { channelContext.put("capacity", channelCapacity.toString) channelContext.put("transactionCapacity", 
1000.toString) channelContext.put("keep-alive", 0.toString) - channelContext.putAll(overrides) + channelContext.putAll(overrides.asJava) channel.setName(scala.util.Random.nextString(10)) channel.configure(channelContext) diff --git a/external/flume/src/main/scala/org/apache/spark/streaming/flume/EventTransformer.scala b/external/flume/src/main/scala/org/apache/spark/streaming/flume/EventTransformer.scala index 65c49c131518b..48df27b26867f 100644 --- a/external/flume/src/main/scala/org/apache/spark/streaming/flume/EventTransformer.scala +++ b/external/flume/src/main/scala/org/apache/spark/streaming/flume/EventTransformer.scala @@ -19,7 +19,7 @@ package org.apache.spark.streaming.flume import java.io.{ObjectOutput, ObjectInput} -import scala.collection.JavaConversions._ +import scala.collection.JavaConverters._ import org.apache.spark.util.Utils import org.apache.spark.Logging @@ -60,7 +60,7 @@ private[streaming] object EventTransformer extends Logging { out.write(body) val numHeaders = headers.size() out.writeInt(numHeaders) - for ((k, v) <- headers) { + for ((k, v) <- headers.asScala) { val keyBuff = Utils.serialize(k.toString) out.writeInt(keyBuff.length) out.write(keyBuff) diff --git a/external/flume/src/main/scala/org/apache/spark/streaming/flume/FlumeBatchFetcher.scala b/external/flume/src/main/scala/org/apache/spark/streaming/flume/FlumeBatchFetcher.scala index 88cc2aa3bf022..b9d4e762ca05d 100644 --- a/external/flume/src/main/scala/org/apache/spark/streaming/flume/FlumeBatchFetcher.scala +++ b/external/flume/src/main/scala/org/apache/spark/streaming/flume/FlumeBatchFetcher.scala @@ -16,7 +16,6 @@ */ package org.apache.spark.streaming.flume -import scala.collection.JavaConversions._ import scala.collection.mutable.ArrayBuffer import com.google.common.base.Throwables @@ -155,7 +154,7 @@ private[flume] class FlumeBatchFetcher(receiver: FlumePollingReceiver) extends R val buffer = new ArrayBuffer[SparkFlumeEvent](events.size()) var j = 0 while (j < events.size()) { - val event = events(j) + val event = events.get(j) val sparkFlumeEvent = new SparkFlumeEvent() sparkFlumeEvent.event.setBody(event.getBody) sparkFlumeEvent.event.setHeaders(event.getHeaders) diff --git a/external/flume/src/main/scala/org/apache/spark/streaming/flume/FlumeInputDStream.scala b/external/flume/src/main/scala/org/apache/spark/streaming/flume/FlumeInputDStream.scala index 1e32a365a1eee..2bf99cb3cba1f 100644 --- a/external/flume/src/main/scala/org/apache/spark/streaming/flume/FlumeInputDStream.scala +++ b/external/flume/src/main/scala/org/apache/spark/streaming/flume/FlumeInputDStream.scala @@ -22,7 +22,7 @@ import java.io.{ObjectInput, ObjectOutput, Externalizable} import java.nio.ByteBuffer import java.util.concurrent.Executors -import scala.collection.JavaConversions._ +import scala.collection.JavaConverters._ import scala.reflect.ClassTag import org.apache.flume.source.avro.AvroSourceProtocol @@ -99,7 +99,7 @@ class SparkFlumeEvent() extends Externalizable { val numHeaders = event.getHeaders.size() out.writeInt(numHeaders) - for ((k, v) <- event.getHeaders) { + for ((k, v) <- event.getHeaders.asScala) { val keyBuff = Utils.serialize(k.toString) out.writeInt(keyBuff.length) out.write(keyBuff) @@ -127,8 +127,7 @@ class FlumeEventServer(receiver : FlumeReceiver) extends AvroSourceProtocol { } override def appendBatch(events : java.util.List[AvroFlumeEvent]) : Status = { - events.foreach (event => - receiver.store(SparkFlumeEvent.fromAvroFlumeEvent(event))) + events.asScala.foreach(event => 
receiver.store(SparkFlumeEvent.fromAvroFlumeEvent(event))) Status.OK } } diff --git a/external/flume/src/main/scala/org/apache/spark/streaming/flume/FlumePollingInputDStream.scala b/external/flume/src/main/scala/org/apache/spark/streaming/flume/FlumePollingInputDStream.scala index 583e7dca317ad..0bc46209b8369 100644 --- a/external/flume/src/main/scala/org/apache/spark/streaming/flume/FlumePollingInputDStream.scala +++ b/external/flume/src/main/scala/org/apache/spark/streaming/flume/FlumePollingInputDStream.scala @@ -20,7 +20,7 @@ package org.apache.spark.streaming.flume import java.net.InetSocketAddress import java.util.concurrent.{LinkedBlockingQueue, Executors} -import scala.collection.JavaConversions._ +import scala.collection.JavaConverters._ import scala.reflect.ClassTag import com.google.common.util.concurrent.ThreadFactoryBuilder @@ -94,9 +94,7 @@ private[streaming] class FlumePollingReceiver( override def onStop(): Unit = { logInfo("Shutting down Flume Polling Receiver") receiverExecutor.shutdownNow() - connections.foreach(connection => { - connection.transceiver.close() - }) + connections.asScala.foreach(_.transceiver.close()) channelFactory.releaseExternalResources() } diff --git a/external/flume/src/main/scala/org/apache/spark/streaming/flume/FlumeTestUtils.scala b/external/flume/src/main/scala/org/apache/spark/streaming/flume/FlumeTestUtils.scala index 9d9c3b189415f..70018c86f92be 100644 --- a/external/flume/src/main/scala/org/apache/spark/streaming/flume/FlumeTestUtils.scala +++ b/external/flume/src/main/scala/org/apache/spark/streaming/flume/FlumeTestUtils.scala @@ -19,9 +19,9 @@ package org.apache.spark.streaming.flume import java.net.{InetSocketAddress, ServerSocket} import java.nio.ByteBuffer -import java.util.{List => JList} +import java.util.Collections -import scala.collection.JavaConversions._ +import scala.collection.JavaConverters._ import com.google.common.base.Charsets.UTF_8 import org.apache.avro.ipc.NettyTransceiver @@ -59,13 +59,13 @@ private[flume] class FlumeTestUtils { } /** Send data to the flume receiver */ - def writeInput(input: JList[String], enableCompression: Boolean): Unit = { + def writeInput(input: Seq[String], enableCompression: Boolean): Unit = { val testAddress = new InetSocketAddress("localhost", testPort) val inputEvents = input.map { item => val event = new AvroFlumeEvent event.setBody(ByteBuffer.wrap(item.getBytes(UTF_8))) - event.setHeaders(Map[CharSequence, CharSequence]("test" -> "header")) + event.setHeaders(Collections.singletonMap("test", "header")) event } @@ -88,7 +88,7 @@ private[flume] class FlumeTestUtils { } // Send data - val status = client.appendBatch(inputEvents.toList) + val status = client.appendBatch(inputEvents.asJava) if (status != avro.Status.OK) { throw new AssertionError("Sent events unsuccessfully") } diff --git a/external/flume/src/main/scala/org/apache/spark/streaming/flume/FlumeUtils.scala b/external/flume/src/main/scala/org/apache/spark/streaming/flume/FlumeUtils.scala index a65a9b921aafa..c719b80aca7ed 100644 --- a/external/flume/src/main/scala/org/apache/spark/streaming/flume/FlumeUtils.scala +++ b/external/flume/src/main/scala/org/apache/spark/streaming/flume/FlumeUtils.scala @@ -21,7 +21,7 @@ import java.net.InetSocketAddress import java.io.{DataOutputStream, ByteArrayOutputStream} import java.util.{List => JList, Map => JMap} -import scala.collection.JavaConversions._ +import scala.collection.JavaConverters._ import org.apache.spark.api.java.function.PairFunction import org.apache.spark.api.python.PythonRDD 
@@ -268,8 +268,8 @@ private[flume] class FlumeUtilsPythonHelper { maxBatchSize: Int, parallelism: Int ): JavaPairDStream[Array[Byte], Array[Byte]] = { - assert(hosts.length == ports.length) - val addresses = hosts.zip(ports).map { + assert(hosts.size() == ports.size()) + val addresses = hosts.asScala.zip(ports.asScala).map { case (host, port) => new InetSocketAddress(host, port) } val dstream = FlumeUtils.createPollingStream( @@ -286,7 +286,7 @@ private object FlumeUtilsPythonHelper { val output = new DataOutputStream(byteStream) try { output.writeInt(map.size) - map.foreach { kv => + map.asScala.foreach { kv => PythonRDD.writeUTF(kv._1.toString, output) PythonRDD.writeUTF(kv._2.toString, output) } diff --git a/external/flume/src/main/scala/org/apache/spark/streaming/flume/PollingFlumeTestUtils.scala b/external/flume/src/main/scala/org/apache/spark/streaming/flume/PollingFlumeTestUtils.scala index 91d63d49dbec3..a2ab320957db3 100644 --- a/external/flume/src/main/scala/org/apache/spark/streaming/flume/PollingFlumeTestUtils.scala +++ b/external/flume/src/main/scala/org/apache/spark/streaming/flume/PollingFlumeTestUtils.scala @@ -18,9 +18,8 @@ package org.apache.spark.streaming.flume import java.util.concurrent._ -import java.util.{List => JList, Map => JMap} +import java.util.{Map => JMap, Collections} -import scala.collection.JavaConversions._ import scala.collection.mutable.ArrayBuffer import com.google.common.base.Charsets.UTF_8 @@ -77,7 +76,7 @@ private[flume] class PollingFlumeTestUtils { /** * Start 2 sinks and return the ports */ - def startMultipleSinks(): JList[Int] = { + def startMultipleSinks(): Seq[Int] = { channels.clear() sinks.clear() @@ -138,8 +137,7 @@ private[flume] class PollingFlumeTestUtils { /** * A Python-friendly method to assert the output */ - def assertOutput( - outputHeaders: JList[JMap[String, String]], outputBodies: JList[String]): Unit = { + def assertOutput(outputHeaders: Seq[JMap[String, String]], outputBodies: Seq[String]): Unit = { require(outputHeaders.size == outputBodies.size) val eventSize = outputHeaders.size if (eventSize != totalEventsPerChannel * channels.size) { @@ -149,12 +147,12 @@ private[flume] class PollingFlumeTestUtils { var counter = 0 for (k <- 0 until channels.size; i <- 0 until totalEventsPerChannel) { val eventBodyToVerify = s"${channels(k).getName}-$i" - val eventHeaderToVerify: JMap[String, String] = Map[String, String](s"test-$i" -> "header") + val eventHeaderToVerify: JMap[String, String] = Collections.singletonMap(s"test-$i", "header") var found = false var j = 0 while (j < eventSize && !found) { - if (eventBodyToVerify == outputBodies.get(j) && - eventHeaderToVerify == outputHeaders.get(j)) { + if (eventBodyToVerify == outputBodies(j) && + eventHeaderToVerify == outputHeaders(j)) { found = true counter += 1 } @@ -195,7 +193,7 @@ private[flume] class PollingFlumeTestUtils { tx.begin() for (j <- 0 until eventsPerBatch) { channel.put(EventBuilder.withBody(s"${channel.getName}-$t".getBytes(UTF_8), - Map[String, String](s"test-$t" -> "header"))) + Collections.singletonMap(s"test-$t", "header"))) t += 1 } tx.commit() diff --git a/external/flume/src/test/scala/org/apache/spark/streaming/flume/FlumePollingStreamSuite.scala b/external/flume/src/test/scala/org/apache/spark/streaming/flume/FlumePollingStreamSuite.scala index d5f9a0aa38f9f..ff2fb8eed204c 100644 --- a/external/flume/src/test/scala/org/apache/spark/streaming/flume/FlumePollingStreamSuite.scala +++ 
b/external/flume/src/test/scala/org/apache/spark/streaming/flume/FlumePollingStreamSuite.scala @@ -19,7 +19,7 @@ package org.apache.spark.streaming.flume import java.net.InetSocketAddress -import scala.collection.JavaConversions._ +import scala.collection.JavaConverters._ import scala.collection.mutable.{SynchronizedBuffer, ArrayBuffer} import scala.concurrent.duration._ import scala.language.postfixOps @@ -116,9 +116,9 @@ class FlumePollingStreamSuite extends SparkFunSuite with BeforeAndAfter with Log // The eventually is required to ensure that all data in the batch has been processed. eventually(timeout(10 seconds), interval(100 milliseconds)) { val flattenOutputBuffer = outputBuffer.flatten - val headers = flattenOutputBuffer.map(_.event.getHeaders.map { - case kv => (kv._1.toString, kv._2.toString) - }).map(mapAsJavaMap) + val headers = flattenOutputBuffer.map(_.event.getHeaders.asScala.map { + case (key, value) => (key.toString, value.toString) + }).map(_.asJava) val bodies = flattenOutputBuffer.map(e => new String(e.event.getBody.array(), UTF_8)) utils.assertOutput(headers, bodies) } diff --git a/external/flume/src/test/scala/org/apache/spark/streaming/flume/FlumeStreamSuite.scala b/external/flume/src/test/scala/org/apache/spark/streaming/flume/FlumeStreamSuite.scala index 5bc4cdf65306c..5ffb60bd602f9 100644 --- a/external/flume/src/test/scala/org/apache/spark/streaming/flume/FlumeStreamSuite.scala +++ b/external/flume/src/test/scala/org/apache/spark/streaming/flume/FlumeStreamSuite.scala @@ -17,7 +17,7 @@ package org.apache.spark.streaming.flume -import scala.collection.JavaConversions._ +import scala.collection.JavaConverters._ import scala.collection.mutable.{ArrayBuffer, SynchronizedBuffer} import scala.concurrent.duration._ import scala.language.postfixOps diff --git a/external/kafka/src/main/scala/org/apache/spark/streaming/kafka/KafkaTestUtils.scala b/external/kafka/src/main/scala/org/apache/spark/streaming/kafka/KafkaTestUtils.scala index 79a9db4291bef..c9fd715d3d554 100644 --- a/external/kafka/src/main/scala/org/apache/spark/streaming/kafka/KafkaTestUtils.scala +++ b/external/kafka/src/main/scala/org/apache/spark/streaming/kafka/KafkaTestUtils.scala @@ -24,6 +24,7 @@ import java.util.concurrent.TimeoutException import java.util.{Map => JMap, Properties} import scala.annotation.tailrec +import scala.collection.JavaConverters._ import scala.language.postfixOps import scala.util.control.NonFatal @@ -159,8 +160,7 @@ private[kafka] class KafkaTestUtils extends Logging { /** Java-friendly function for sending messages to the Kafka broker */ def sendMessages(topic: String, messageToFreq: JMap[String, JInt]): Unit = { - import scala.collection.JavaConversions._ - sendMessages(topic, Map(messageToFreq.mapValues(_.intValue()).toSeq: _*)) + sendMessages(topic, Map(messageToFreq.asScala.mapValues(_.intValue()).toSeq: _*)) } /** Send the messages to the Kafka broker */ diff --git a/external/kafka/src/main/scala/org/apache/spark/streaming/kafka/KafkaUtils.scala b/external/kafka/src/main/scala/org/apache/spark/streaming/kafka/KafkaUtils.scala index 388dbb8184106..3128222077537 100644 --- a/external/kafka/src/main/scala/org/apache/spark/streaming/kafka/KafkaUtils.scala +++ b/external/kafka/src/main/scala/org/apache/spark/streaming/kafka/KafkaUtils.scala @@ -20,7 +20,7 @@ package org.apache.spark.streaming.kafka import java.lang.{Integer => JInt, Long => JLong} import java.util.{List => JList, Map => JMap, Set => JSet} -import scala.collection.JavaConversions._ +import 
scala.collection.JavaConverters._ import scala.reflect.ClassTag import kafka.common.TopicAndPartition @@ -96,7 +96,7 @@ object KafkaUtils { groupId: String, topics: JMap[String, JInt] ): JavaPairReceiverInputDStream[String, String] = { - createStream(jssc.ssc, zkQuorum, groupId, Map(topics.mapValues(_.intValue()).toSeq: _*)) + createStream(jssc.ssc, zkQuorum, groupId, Map(topics.asScala.mapValues(_.intValue()).toSeq: _*)) } /** @@ -115,7 +115,7 @@ object KafkaUtils { topics: JMap[String, JInt], storageLevel: StorageLevel ): JavaPairReceiverInputDStream[String, String] = { - createStream(jssc.ssc, zkQuorum, groupId, Map(topics.mapValues(_.intValue()).toSeq: _*), + createStream(jssc.ssc, zkQuorum, groupId, Map(topics.asScala.mapValues(_.intValue()).toSeq: _*), storageLevel) } @@ -149,7 +149,10 @@ object KafkaUtils { implicit val valueCmd: ClassTag[T] = ClassTag(valueDecoderClass) createStream[K, V, U, T]( - jssc.ssc, kafkaParams.toMap, Map(topics.mapValues(_.intValue()).toSeq: _*), storageLevel) + jssc.ssc, + kafkaParams.asScala.toMap, + Map(topics.asScala.mapValues(_.intValue()).toSeq: _*), + storageLevel) } /** get leaders for the given offset ranges, or throw an exception */ @@ -275,7 +278,7 @@ object KafkaUtils { implicit val keyDecoderCmt: ClassTag[KD] = ClassTag(keyDecoderClass) implicit val valueDecoderCmt: ClassTag[VD] = ClassTag(valueDecoderClass) new JavaPairRDD(createRDD[K, V, KD, VD]( - jsc.sc, Map(kafkaParams.toSeq: _*), offsetRanges)) + jsc.sc, Map(kafkaParams.asScala.toSeq: _*), offsetRanges)) } /** @@ -311,9 +314,9 @@ object KafkaUtils { implicit val keyDecoderCmt: ClassTag[KD] = ClassTag(keyDecoderClass) implicit val valueDecoderCmt: ClassTag[VD] = ClassTag(valueDecoderClass) implicit val recordCmt: ClassTag[R] = ClassTag(recordClass) - val leaderMap = Map(leaders.toSeq: _*) + val leaderMap = Map(leaders.asScala.toSeq: _*) createRDD[K, V, KD, VD, R]( - jsc.sc, Map(kafkaParams.toSeq: _*), offsetRanges, leaderMap, messageHandler.call _) + jsc.sc, Map(kafkaParams.asScala.toSeq: _*), offsetRanges, leaderMap, messageHandler.call(_)) } /** @@ -476,8 +479,8 @@ object KafkaUtils { val cleanedHandler = jssc.sparkContext.clean(messageHandler.call _) createDirectStream[K, V, KD, VD, R]( jssc.ssc, - Map(kafkaParams.toSeq: _*), - Map(fromOffsets.mapValues { _.longValue() }.toSeq: _*), + Map(kafkaParams.asScala.toSeq: _*), + Map(fromOffsets.asScala.mapValues(_.longValue()).toSeq: _*), cleanedHandler ) } @@ -531,8 +534,8 @@ object KafkaUtils { implicit val valueDecoderCmt: ClassTag[VD] = ClassTag(valueDecoderClass) createDirectStream[K, V, KD, VD]( jssc.ssc, - Map(kafkaParams.toSeq: _*), - Set(topics.toSeq: _*) + Map(kafkaParams.asScala.toSeq: _*), + Set(topics.asScala.toSeq: _*) ) } } @@ -602,10 +605,10 @@ private[kafka] class KafkaUtilsPythonHelper { ): JavaPairInputDStream[Array[Byte], Array[Byte]] = { if (!fromOffsets.isEmpty) { - import scala.collection.JavaConversions._ - val topicsFromOffsets = fromOffsets.keySet().map(_.topic) - if (topicsFromOffsets != topics.toSet) { - throw new IllegalStateException(s"The specified topics: ${topics.toSet.mkString(" ")} " + + val topicsFromOffsets = fromOffsets.keySet().asScala.map(_.topic) + if (topicsFromOffsets != topics.asScala.toSet) { + throw new IllegalStateException( + s"The specified topics: ${topics.asScala.toSet.mkString(" ")} " + s"do not equal to the topic from offsets: ${topicsFromOffsets.mkString(" ")}") } } @@ -663,6 +666,6 @@ private[kafka] class KafkaUtilsPythonHelper { "with this RDD, please call this method only on a Kafka 
RDD.") val kafkaRDD = kafkaRDDs.head.asInstanceOf[KafkaRDD[_, _, _, _, _]] - kafkaRDD.offsetRanges.toSeq + kafkaRDD.offsetRanges.toSeq.asJava } } diff --git a/external/zeromq/src/main/scala/org/apache/spark/streaming/zeromq/ZeroMQUtils.scala b/external/zeromq/src/main/scala/org/apache/spark/streaming/zeromq/ZeroMQUtils.scala index 0469d0af8864a..4ea218eaa4de1 100644 --- a/external/zeromq/src/main/scala/org/apache/spark/streaming/zeromq/ZeroMQUtils.scala +++ b/external/zeromq/src/main/scala/org/apache/spark/streaming/zeromq/ZeroMQUtils.scala @@ -18,15 +18,17 @@ package org.apache.spark.streaming.zeromq import scala.reflect.ClassTag -import scala.collection.JavaConversions._ +import scala.collection.JavaConverters._ + import akka.actor.{Props, SupervisorStrategy} import akka.util.ByteString import akka.zeromq.Subscribe + import org.apache.spark.api.java.function.{Function => JFunction} import org.apache.spark.storage.StorageLevel import org.apache.spark.streaming.StreamingContext import org.apache.spark.streaming.api.java.{JavaReceiverInputDStream, JavaStreamingContext} -import org.apache.spark.streaming.dstream.{ReceiverInputDStream} +import org.apache.spark.streaming.dstream.ReceiverInputDStream import org.apache.spark.streaming.receiver.ActorSupervisorStrategy object ZeroMQUtils { @@ -75,7 +77,8 @@ object ZeroMQUtils { ): JavaReceiverInputDStream[T] = { implicit val cm: ClassTag[T] = implicitly[ClassTag[AnyRef]].asInstanceOf[ClassTag[T]] - val fn = (x: Seq[ByteString]) => bytesToObjects.call(x.map(_.toArray).toArray).toIterator + val fn = + (x: Seq[ByteString]) => bytesToObjects.call(x.map(_.toArray).toArray).iterator().asScala createStream[T](jssc.ssc, publisherUrl, subscribe, fn, storageLevel, supervisorStrategy) } @@ -99,7 +102,8 @@ object ZeroMQUtils { ): JavaReceiverInputDStream[T] = { implicit val cm: ClassTag[T] = implicitly[ClassTag[AnyRef]].asInstanceOf[ClassTag[T]] - val fn = (x: Seq[ByteString]) => bytesToObjects.call(x.map(_.toArray).toArray).toIterator + val fn = + (x: Seq[ByteString]) => bytesToObjects.call(x.map(_.toArray).toArray).iterator().asScala createStream[T](jssc.ssc, publisherUrl, subscribe, fn, storageLevel) } @@ -122,7 +126,8 @@ object ZeroMQUtils { ): JavaReceiverInputDStream[T] = { implicit val cm: ClassTag[T] = implicitly[ClassTag[AnyRef]].asInstanceOf[ClassTag[T]] - val fn = (x: Seq[ByteString]) => bytesToObjects.call(x.map(_.toArray).toArray).toIterator + val fn = + (x: Seq[ByteString]) => bytesToObjects.call(x.map(_.toArray).toArray).iterator().asScala createStream[T](jssc.ssc, publisherUrl, subscribe, fn) } } diff --git a/extras/kinesis-asl/src/main/scala/org/apache/spark/streaming/kinesis/KinesisBackedBlockRDD.scala b/extras/kinesis-asl/src/main/scala/org/apache/spark/streaming/kinesis/KinesisBackedBlockRDD.scala index a003ddf325e6e..5d32fa699ae5b 100644 --- a/extras/kinesis-asl/src/main/scala/org/apache/spark/streaming/kinesis/KinesisBackedBlockRDD.scala +++ b/extras/kinesis-asl/src/main/scala/org/apache/spark/streaming/kinesis/KinesisBackedBlockRDD.scala @@ -17,7 +17,7 @@ package org.apache.spark.streaming.kinesis -import scala.collection.JavaConversions._ +import scala.collection.JavaConverters._ import scala.util.control.NonFatal import com.amazonaws.auth.{AWSCredentials, DefaultAWSCredentialsProviderChain} @@ -213,7 +213,7 @@ class KinesisSequenceRangeIterator( s"getting records using shard iterator") { client.getRecords(getRecordsRequest) } - (getRecordsResult.getRecords.iterator(), getRecordsResult.getNextShardIterator) + 
(getRecordsResult.getRecords.iterator().asScala, getRecordsResult.getNextShardIterator) } /** diff --git a/extras/kinesis-asl/src/main/scala/org/apache/spark/streaming/kinesis/KinesisReceiver.scala b/extras/kinesis-asl/src/main/scala/org/apache/spark/streaming/kinesis/KinesisReceiver.scala index 22324e821ce94..6e0988c1af8a1 100644 --- a/extras/kinesis-asl/src/main/scala/org/apache/spark/streaming/kinesis/KinesisReceiver.scala +++ b/extras/kinesis-asl/src/main/scala/org/apache/spark/streaming/kinesis/KinesisReceiver.scala @@ -18,7 +18,7 @@ package org.apache.spark.streaming.kinesis import java.util.UUID -import scala.collection.JavaConversions.asScalaIterator +import scala.collection.JavaConverters._ import scala.collection.mutable import scala.util.control.NonFatal @@ -202,7 +202,7 @@ private[kinesis] class KinesisReceiver( /** Add records of the given shard to the current block being generated */ private[kinesis] def addRecords(shardId: String, records: java.util.List[Record]): Unit = { if (records.size > 0) { - val dataIterator = records.iterator().map { record => + val dataIterator = records.iterator().asScala.map { record => val byteBuffer = record.getData() val byteArray = new Array[Byte](byteBuffer.remaining()) byteBuffer.get(byteArray) diff --git a/extras/kinesis-asl/src/main/scala/org/apache/spark/streaming/kinesis/KinesisTestUtils.scala b/extras/kinesis-asl/src/main/scala/org/apache/spark/streaming/kinesis/KinesisTestUtils.scala index c8eec13ec7dc7..634bf94521079 100644 --- a/extras/kinesis-asl/src/main/scala/org/apache/spark/streaming/kinesis/KinesisTestUtils.scala +++ b/extras/kinesis-asl/src/main/scala/org/apache/spark/streaming/kinesis/KinesisTestUtils.scala @@ -20,6 +20,7 @@ package org.apache.spark.streaming.kinesis import java.nio.ByteBuffer import java.util.concurrent.TimeUnit +import scala.collection.JavaConverters._ import scala.collection.mutable import scala.collection.mutable.ArrayBuffer import scala.util.{Failure, Random, Success, Try} @@ -115,7 +116,7 @@ private[kinesis] class KinesisTestUtils extends Logging { * Expose a Python friendly API. 
*/ def pushData(testData: java.util.List[Int]): Unit = { - pushData(scala.collection.JavaConversions.asScalaBuffer(testData)) + pushData(testData.asScala) } def deleteStream(): Unit = { diff --git a/extras/kinesis-asl/src/test/scala/org/apache/spark/streaming/kinesis/KinesisReceiverSuite.scala b/extras/kinesis-asl/src/test/scala/org/apache/spark/streaming/kinesis/KinesisReceiverSuite.scala index ceb135e0651aa..3d136aec2e702 100644 --- a/extras/kinesis-asl/src/test/scala/org/apache/spark/streaming/kinesis/KinesisReceiverSuite.scala +++ b/extras/kinesis-asl/src/test/scala/org/apache/spark/streaming/kinesis/KinesisReceiverSuite.scala @@ -17,10 +17,10 @@ package org.apache.spark.streaming.kinesis import java.nio.ByteBuffer +import java.nio.charset.StandardCharsets +import java.util.Arrays -import scala.collection.JavaConversions.seqAsJavaList - -import com.amazonaws.services.kinesis.clientlibrary.exceptions.{InvalidStateException, KinesisClientLibDependencyException, ShutdownException, ThrottlingException} +import com.amazonaws.services.kinesis.clientlibrary.exceptions._ import com.amazonaws.services.kinesis.clientlibrary.interfaces.IRecordProcessorCheckpointer import com.amazonaws.services.kinesis.clientlibrary.types.ShutdownReason import com.amazonaws.services.kinesis.model.Record @@ -47,10 +47,10 @@ class KinesisReceiverSuite extends TestSuiteBase with Matchers with BeforeAndAft val someSeqNum = Some(seqNum) val record1 = new Record() - record1.setData(ByteBuffer.wrap("Spark In Action".getBytes())) + record1.setData(ByteBuffer.wrap("Spark In Action".getBytes(StandardCharsets.UTF_8))) val record2 = new Record() - record2.setData(ByteBuffer.wrap("Learning Spark".getBytes())) - val batch = List[Record](record1, record2) + record2.setData(ByteBuffer.wrap("Learning Spark".getBytes(StandardCharsets.UTF_8))) + val batch = Arrays.asList(record1, record2) var receiverMock: KinesisReceiver = _ var checkpointerMock: IRecordProcessorCheckpointer = _ diff --git a/mllib/src/main/scala/org/apache/spark/mllib/util/LinearDataGenerator.scala b/mllib/src/main/scala/org/apache/spark/mllib/util/LinearDataGenerator.scala index 87eeb5db05d26..7a1c7796065ee 100644 --- a/mllib/src/main/scala/org/apache/spark/mllib/util/LinearDataGenerator.scala +++ b/mllib/src/main/scala/org/apache/spark/mllib/util/LinearDataGenerator.scala @@ -17,7 +17,7 @@ package org.apache.spark.mllib.util -import scala.collection.JavaConversions._ +import scala.collection.JavaConverters._ import scala.util.Random import com.github.fommil.netlib.BLAS.{getInstance => blas} @@ -52,7 +52,7 @@ object LinearDataGenerator { nPoints: Int, seed: Int, eps: Double): java.util.List[LabeledPoint] = { - seqAsJavaList(generateLinearInput(intercept, weights, nPoints, seed, eps)) + generateLinearInput(intercept, weights, nPoints, seed, eps).asJava } /** diff --git a/mllib/src/test/java/org/apache/spark/ml/classification/JavaOneVsRestSuite.java b/mllib/src/test/java/org/apache/spark/ml/classification/JavaOneVsRestSuite.java index a1ee554152372..2744e020e9e49 100644 --- a/mllib/src/test/java/org/apache/spark/ml/classification/JavaOneVsRestSuite.java +++ b/mllib/src/test/java/org/apache/spark/ml/classification/JavaOneVsRestSuite.java @@ -20,7 +20,7 @@ import java.io.Serializable; import java.util.List; -import static scala.collection.JavaConversions.seqAsJavaList; +import scala.collection.JavaConverters; import org.junit.After; import org.junit.Assert; @@ -55,8 +55,9 @@ public void setUp() { double[] xMean = {5.843, 3.057, 3.758, 1.199}; double[] xVariance = 
{0.6856, 0.1899, 3.116, 0.581}; - List points = seqAsJavaList(generateMultinomialLogisticInput( - weights, xMean, xVariance, true, nPoints, 42)); + List points = JavaConverters.asJavaListConverter( + generateMultinomialLogisticInput(weights, xMean, xVariance, true, nPoints, 42) + ).asJava(); datasetRDD = jsc.parallelize(points, 2); dataset = jsql.createDataFrame(datasetRDD, LabeledPoint.class); } diff --git a/mllib/src/test/scala/org/apache/spark/mllib/classification/LogisticRegressionSuite.scala b/mllib/src/test/scala/org/apache/spark/mllib/classification/LogisticRegressionSuite.scala index 2473510e13514..8d14bb6572155 100644 --- a/mllib/src/test/scala/org/apache/spark/mllib/classification/LogisticRegressionSuite.scala +++ b/mllib/src/test/scala/org/apache/spark/mllib/classification/LogisticRegressionSuite.scala @@ -17,7 +17,7 @@ package org.apache.spark.mllib.classification -import scala.collection.JavaConversions._ +import scala.collection.JavaConverters._ import scala.util.Random import scala.util.control.Breaks._ @@ -38,7 +38,7 @@ object LogisticRegressionSuite { scale: Double, nPoints: Int, seed: Int): java.util.List[LabeledPoint] = { - seqAsJavaList(generateLogisticInput(offset, scale, nPoints, seed)) + generateLogisticInput(offset, scale, nPoints, seed).asJava } // Generate input of the form Y = logistic(offset + scale*X) diff --git a/mllib/src/test/scala/org/apache/spark/mllib/classification/SVMSuite.scala b/mllib/src/test/scala/org/apache/spark/mllib/classification/SVMSuite.scala index b1d78cba9e3dc..ee3c85d09a463 100644 --- a/mllib/src/test/scala/org/apache/spark/mllib/classification/SVMSuite.scala +++ b/mllib/src/test/scala/org/apache/spark/mllib/classification/SVMSuite.scala @@ -17,7 +17,7 @@ package org.apache.spark.mllib.classification -import scala.collection.JavaConversions._ +import scala.collection.JavaConverters._ import scala.util.Random import org.jblas.DoubleMatrix @@ -35,7 +35,7 @@ object SVMSuite { weights: Array[Double], nPoints: Int, seed: Int): java.util.List[LabeledPoint] = { - seqAsJavaList(generateSVMInput(intercept, weights, nPoints, seed)) + generateSVMInput(intercept, weights, nPoints, seed).asJava } // Generate noisy input of the form Y = signum(x.dot(weights) + intercept + noise) diff --git a/mllib/src/test/scala/org/apache/spark/mllib/optimization/GradientDescentSuite.scala b/mllib/src/test/scala/org/apache/spark/mllib/optimization/GradientDescentSuite.scala index 13b754a03943a..36ac7d267243d 100644 --- a/mllib/src/test/scala/org/apache/spark/mllib/optimization/GradientDescentSuite.scala +++ b/mllib/src/test/scala/org/apache/spark/mllib/optimization/GradientDescentSuite.scala @@ -17,7 +17,7 @@ package org.apache.spark.mllib.optimization -import scala.collection.JavaConversions._ +import scala.collection.JavaConverters._ import scala.util.Random import org.scalatest.Matchers @@ -35,7 +35,7 @@ object GradientDescentSuite { scale: Double, nPoints: Int, seed: Int): java.util.List[LabeledPoint] = { - seqAsJavaList(generateGDInput(offset, scale, nPoints, seed)) + generateGDInput(offset, scale, nPoints, seed).asJava } // Generate input of the form Y = logistic(offset + scale * X) diff --git a/mllib/src/test/scala/org/apache/spark/mllib/recommendation/ALSSuite.scala b/mllib/src/test/scala/org/apache/spark/mllib/recommendation/ALSSuite.scala index 05b87728d6fdb..045135f7f8d60 100644 --- a/mllib/src/test/scala/org/apache/spark/mllib/recommendation/ALSSuite.scala +++ b/mllib/src/test/scala/org/apache/spark/mllib/recommendation/ALSSuite.scala @@ -17,7 +17,7 @@ 
package org.apache.spark.mllib.recommendation -import scala.collection.JavaConversions._ +import scala.collection.JavaConverters._ import scala.math.abs import scala.util.Random @@ -38,7 +38,7 @@ object ALSSuite { negativeWeights: Boolean): (java.util.List[Rating], DoubleMatrix, DoubleMatrix) = { val (sampledRatings, trueRatings, truePrefs) = generateRatings(users, products, features, samplingRate, implicitPrefs) - (seqAsJavaList(sampledRatings), trueRatings, truePrefs) + (sampledRatings.asJava, trueRatings, truePrefs) } def generateRatings( diff --git a/project/SparkBuild.scala b/project/SparkBuild.scala index 04e0d49b178cf..ea52bfd67944a 100644 --- a/project/SparkBuild.scala +++ b/project/SparkBuild.scala @@ -18,13 +18,13 @@ import java.io._ import scala.util.Properties -import scala.collection.JavaConversions._ +import scala.collection.JavaConverters._ import sbt._ import sbt.Classpaths.publishTask import sbt.Keys._ import sbtunidoc.Plugin.UnidocKeys.unidocGenjavadocVersion -import com.typesafe.sbt.pom.{loadEffectivePom, PomBuild, SbtPomKeys} +import com.typesafe.sbt.pom.{PomBuild, SbtPomKeys} import net.virtualvoid.sbt.graph.Plugin.graphSettings import spray.revolver.RevolverPlugin._ @@ -120,7 +120,7 @@ object SparkBuild extends PomBuild { case _ => } - override val userPropertiesMap = System.getProperties.toMap + override val userPropertiesMap = System.getProperties.asScala.toMap lazy val MavenCompile = config("m2r") extend(Compile) lazy val publishLocalBoth = TaskKey[Unit]("publish-local", "publish local for m2 and ivy") @@ -559,7 +559,7 @@ object TestSettings { javaOptions in Test += "-Dspark.unsafe.exceptionOnMemoryLeak=true", javaOptions in Test += "-Dsun.io.serialization.extendedDebugInfo=true", javaOptions in Test += "-Dderby.system.durability=test", - javaOptions in Test ++= System.getProperties.filter(_._1 startsWith "spark") + javaOptions in Test ++= System.getProperties.asScala.filter(_._1.startsWith("spark")) .map { case (k,v) => s"-D$k=$v" }.toSeq, javaOptions in Test += "-ea", javaOptions in Test ++= "-Xmx3g -Xss4096k -XX:PermSize=128M -XX:MaxNewSize=256m -XX:MaxPermSize=1g" diff --git a/python/pyspark/sql/column.py b/python/pyspark/sql/column.py index 8af8637cf948d..0948f9b27cd38 100644 --- a/python/pyspark/sql/column.py +++ b/python/pyspark/sql/column.py @@ -61,6 +61,18 @@ def _to_seq(sc, cols, converter=None): return sc._jvm.PythonUtils.toSeq(cols) +def _to_list(sc, cols, converter=None): + """ + Convert a list of Column (or names) into a JVM (Scala) List of Column. + + An optional `converter` could be used to convert items in `cols` + into JVM Column objects. + """ + if converter: + cols = [converter(c) for c in cols] + return sc._jvm.PythonUtils.toList(cols) + + def _unary_op(name, doc="unary operator"): """ Create a method for given unary operator """ def _(self): diff --git a/python/pyspark/sql/dataframe.py b/python/pyspark/sql/dataframe.py index 025811f519293..e269ef4304f3f 100644 --- a/python/pyspark/sql/dataframe.py +++ b/python/pyspark/sql/dataframe.py @@ -32,7 +32,7 @@ from pyspark.traceback_utils import SCCallSiteSync from pyspark.sql import since from pyspark.sql.types import _parse_datatype_json_string -from pyspark.sql.column import Column, _to_seq, _to_java_column +from pyspark.sql.column import Column, _to_seq, _to_list, _to_java_column from pyspark.sql.readwriter import DataFrameWriter from pyspark.sql.types import * @@ -494,7 +494,7 @@ def randomSplit(self, weights, seed=None): if w < 0.0: raise ValueError("Weights must be positive. 
Found weight value: %s" % w) seed = seed if seed is not None else random.randint(0, sys.maxsize) - rdd_array = self._jdf.randomSplit(_to_seq(self.sql_ctx._sc, weights), long(seed)) + rdd_array = self._jdf.randomSplit(_to_list(self.sql_ctx._sc, weights), long(seed)) return [DataFrame(rdd, self.sql_ctx) for rdd in rdd_array] @property diff --git a/scalastyle-config.xml b/scalastyle-config.xml index b5e2e882d2254..68fdb4141cf27 100644 --- a/scalastyle-config.xml +++ b/scalastyle-config.xml @@ -161,6 +161,13 @@ This file is divided into 3 sections: ]]> + + + JavaConversions + Instead of importing implicits in scala.collection.JavaConversions._, import + scala.collection.JavaConverters._ and use .asScala / .asJava methods + + diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/Row.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/Row.scala index ec895af9c3037..cfd9cb0e62598 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/Row.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/Row.scala @@ -17,6 +17,8 @@ package org.apache.spark.sql +import scala.collection.JavaConverters._ + import org.apache.spark.sql.catalyst.InternalRow import org.apache.spark.sql.catalyst.expressions.GenericRow import org.apache.spark.sql.types.StructType @@ -280,9 +282,8 @@ trait Row extends Serializable { * * @throws ClassCastException when data type does not match. */ - def getList[T](i: Int): java.util.List[T] = { - scala.collection.JavaConversions.seqAsJavaList(getSeq[T](i)) - } + def getList[T](i: Int): java.util.List[T] = + getSeq[T](i).asJava /** * Returns the value at position i of map type as a Scala Map. @@ -296,9 +297,8 @@ trait Row extends Serializable { * * @throws ClassCastException when data type does not match. */ - def getJavaMap[K, V](i: Int): java.util.Map[K, V] = { - scala.collection.JavaConversions.mapAsJavaMap(getMap[K, V](i)) - } + def getJavaMap[K, V](i: Int): java.util.Map[K, V] = + getMap[K, V](i).asJava /** * Returns the value at position i of struct type as an [[Row]] object. 
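For reference, a minimal standalone sketch of the explicit-conversion style the new scalastyle rule above asks for: import scala.collection.JavaConverters._ once, then call .asScala / .asJava at each boundary instead of relying on the JavaConversions implicits. Object and value names here are illustrative only, not part of the patch.

import java.util.{ArrayList => JArrayList}

import scala.collection.JavaConverters._

object ConvertersSketch {
  def main(args: Array[String]): Unit = {
    // Java -> Scala: asScala wraps the Java collection in a Scala view (no copy).
    val jList = new JArrayList[String]()
    jList.add("a")
    jList.add("b")
    val scalaSeq: Seq[String] = jList.asScala.toSeq

    // Scala -> Java: asJava goes the other direction.
    val jMap: java.util.Map[String, Int] = Map("x" -> 1, "y" -> 2).asJava

    println(scalaSeq.mkString(","))
    println(jMap.get("x"))
  }
}

The conversion site is now visible at every call, which is exactly what the style check enforces.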
diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/Catalog.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/Catalog.scala index 503c4f4b20f38..4cc9a5520a085 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/Catalog.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/Catalog.scala @@ -19,7 +19,7 @@ package org.apache.spark.sql.catalyst.analysis import java.util.concurrent.ConcurrentHashMap -import scala.collection.JavaConversions._ +import scala.collection.JavaConverters._ import scala.collection.mutable import scala.collection.mutable.ArrayBuffer @@ -147,7 +147,7 @@ class SimpleCatalog(val conf: CatalystConf) extends Catalog { override def getTables(databaseName: Option[String]): Seq[(String, Boolean)] = { val result = ArrayBuffer.empty[(String, Boolean)] - for (name <- tables.keySet()) { + for (name <- tables.keySet().asScala) { result += ((name, true)) } result diff --git a/sql/core/src/main/scala/org/apache/spark/sql/DataFrameNaFunctions.scala b/sql/core/src/main/scala/org/apache/spark/sql/DataFrameNaFunctions.scala index a4fd4cf3b330b..77a42c0873a6b 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/DataFrameNaFunctions.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/DataFrameNaFunctions.scala @@ -19,7 +19,7 @@ package org.apache.spark.sql import java.{lang => jl} -import scala.collection.JavaConversions._ +import scala.collection.JavaConverters._ import org.apache.spark.annotation.Experimental import org.apache.spark.sql.catalyst.expressions._ @@ -209,7 +209,7 @@ final class DataFrameNaFunctions private[sql](df: DataFrame) { * * @since 1.3.1 */ - def fill(valueMap: java.util.Map[String, Any]): DataFrame = fill0(valueMap.toSeq) + def fill(valueMap: java.util.Map[String, Any]): DataFrame = fill0(valueMap.asScala.toSeq) /** * (Scala-specific) Returns a new [[DataFrame]] that replaces null values. 
@@ -254,7 +254,7 @@ final class DataFrameNaFunctions private[sql](df: DataFrame) { * @since 1.3.1 */ def replace[T](col: String, replacement: java.util.Map[T, T]): DataFrame = { - replace[T](col, replacement.toMap : Map[T, T]) + replace[T](col, replacement.asScala.toMap) } /** @@ -277,7 +277,7 @@ final class DataFrameNaFunctions private[sql](df: DataFrame) { * @since 1.3.1 */ def replace[T](cols: Array[String], replacement: java.util.Map[T, T]): DataFrame = { - replace(cols.toSeq, replacement.toMap) + replace(cols.toSeq, replacement.asScala.toMap) } /** diff --git a/sql/core/src/main/scala/org/apache/spark/sql/DataFrameReader.scala b/sql/core/src/main/scala/org/apache/spark/sql/DataFrameReader.scala index 6dc7bfe333498..97a8b6518a832 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/DataFrameReader.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/DataFrameReader.scala @@ -19,6 +19,8 @@ package org.apache.spark.sql import java.util.Properties +import scala.collection.JavaConverters._ + import org.apache.hadoop.fs.Path import org.apache.spark.annotation.Experimental @@ -90,7 +92,7 @@ class DataFrameReader private[sql](sqlContext: SQLContext) extends Logging { * @since 1.4.0 */ def options(options: java.util.Map[String, String]): DataFrameReader = { - this.options(scala.collection.JavaConversions.mapAsScalaMap(options)) + this.options(options.asScala) this } diff --git a/sql/core/src/main/scala/org/apache/spark/sql/DataFrameWriter.scala b/sql/core/src/main/scala/org/apache/spark/sql/DataFrameWriter.scala index ce8744b53175b..b2a66dd417b4c 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/DataFrameWriter.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/DataFrameWriter.scala @@ -19,6 +19,8 @@ package org.apache.spark.sql import java.util.Properties +import scala.collection.JavaConverters._ + import org.apache.spark.annotation.Experimental import org.apache.spark.sql.catalyst.{SqlParser, TableIdentifier} import org.apache.spark.sql.catalyst.analysis.UnresolvedRelation @@ -109,7 +111,7 @@ final class DataFrameWriter private[sql](df: DataFrame) { * @since 1.4.0 */ def options(options: java.util.Map[String, String]): DataFrameWriter = { - this.options(scala.collection.JavaConversions.mapAsScalaMap(options)) + this.options(options.asScala) this } diff --git a/sql/core/src/main/scala/org/apache/spark/sql/GroupedData.scala b/sql/core/src/main/scala/org/apache/spark/sql/GroupedData.scala index 99d557b03a033..ee31d83cce42c 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/GroupedData.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/GroupedData.scala @@ -17,7 +17,7 @@ package org.apache.spark.sql -import scala.collection.JavaConversions._ +import scala.collection.JavaConverters._ import scala.language.implicitConversions import org.apache.spark.annotation.Experimental @@ -188,7 +188,7 @@ class GroupedData protected[sql]( * @since 1.3.0 */ def agg(exprs: java.util.Map[String, String]): DataFrame = { - agg(exprs.toMap) + agg(exprs.asScala.toMap) } /** diff --git a/sql/core/src/main/scala/org/apache/spark/sql/SQLConf.scala b/sql/core/src/main/scala/org/apache/spark/sql/SQLConf.scala index e9de14f025502..e6f7619519e6a 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/SQLConf.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/SQLConf.scala @@ -20,7 +20,7 @@ package org.apache.spark.sql import java.util.Properties import scala.collection.immutable -import scala.collection.JavaConversions._ +import scala.collection.JavaConverters._ import 
org.apache.parquet.hadoop.ParquetOutputCommitter @@ -531,7 +531,7 @@ private[sql] class SQLConf extends Serializable with CatalystConf { /** Set Spark SQL configuration properties. */ def setConf(props: Properties): Unit = settings.synchronized { - props.foreach { case (k, v) => setConfString(k, v) } + props.asScala.foreach { case (k, v) => setConfString(k, v) } } /** Set the given Spark SQL configuration property using a `string` value. */ @@ -601,24 +601,25 @@ private[sql] class SQLConf extends Serializable with CatalystConf { * Return all the configuration properties that have been set (i.e. not the default). * This creates a new copy of the config properties in the form of a Map. */ - def getAllConfs: immutable.Map[String, String] = settings.synchronized { settings.toMap } + def getAllConfs: immutable.Map[String, String] = + settings.synchronized { settings.asScala.toMap } /** * Return all the configuration definitions that have been defined in [[SQLConf]]. Each * definition contains key, defaultValue and doc. */ def getAllDefinedConfs: Seq[(String, String, String)] = sqlConfEntries.synchronized { - sqlConfEntries.values.filter(_.isPublic).map { entry => + sqlConfEntries.values.asScala.filter(_.isPublic).map { entry => (entry.key, entry.defaultValueString, entry.doc) }.toSeq } private[spark] def unsetConf(key: String): Unit = { - settings -= key + settings.remove(key) } private[spark] def unsetConf(entry: SQLConfEntry[_]): Unit = { - settings -= entry.key + settings.remove(entry.key) } private[spark] def clear(): Unit = { diff --git a/sql/core/src/main/scala/org/apache/spark/sql/SQLContext.scala b/sql/core/src/main/scala/org/apache/spark/sql/SQLContext.scala index a1eea09e0477b..4e8414af50b44 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/SQLContext.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/SQLContext.scala @@ -21,7 +21,7 @@ import java.beans.Introspector import java.util.Properties import java.util.concurrent.atomic.AtomicReference -import scala.collection.JavaConversions._ +import scala.collection.JavaConverters._ import scala.collection.immutable import scala.reflect.runtime.universe.TypeTag import scala.util.control.NonFatal @@ -225,7 +225,7 @@ class SQLContext(@transient val sparkContext: SparkContext) conf.setConf(properties) // After we have populated SQLConf, we call setConf to populate other confs in the subclass // (e.g. hiveconf in HiveContext). 
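The SQLConf hunks above depend on the fact that a java.util.Properties or ConcurrentHashMap only gains Scala collection operations through an explicit .asScala view, and that removal goes through the Java API rather than `-=`. A small sketch of that pattern, with illustrative names standing in for the real SQLConf internals:

import java.util.Properties
import java.util.concurrent.ConcurrentHashMap

import scala.collection.JavaConverters._

object ConfSketch {
  private val settings = new ConcurrentHashMap[String, String]()

  def setConf(props: Properties): Unit =
    // Properties.asScala is a mutable Map[String, String] view over the Java object.
    props.asScala.foreach { case (k, v) => settings.put(k, v) }

  // Snapshot as an immutable Scala map; asScala alone only wraps, it does not copy.
  def getAllConfs: Map[String, String] = settings.asScala.toMap

  // The Java map has no `-=`, so removal uses the Java API directly.
  def unsetConf(key: String): Unit = settings.remove(key)
}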
- properties.foreach { + properties.asScala.foreach { case (key, value) => setConf(key, value) } } @@ -567,7 +567,7 @@ class SQLContext(@transient val sparkContext: SparkContext) tableName: String, source: String, options: java.util.Map[String, String]): DataFrame = { - createExternalTable(tableName, source, options.toMap) + createExternalTable(tableName, source, options.asScala.toMap) } /** @@ -612,7 +612,7 @@ class SQLContext(@transient val sparkContext: SparkContext) source: String, schema: StructType, options: java.util.Map[String, String]): DataFrame = { - createExternalTable(tableName, source, schema, options.toMap) + createExternalTable(tableName, source, schema, options.asScala.toMap) } /** diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/ResolvedDataSource.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/ResolvedDataSource.scala index 8fbaf3a3059db..011724436621d 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/ResolvedDataSource.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/ResolvedDataSource.scala @@ -19,7 +19,7 @@ package org.apache.spark.sql.execution.datasources import java.util.ServiceLoader -import scala.collection.JavaConversions._ +import scala.collection.JavaConverters._ import scala.language.{existentials, implicitConversions} import scala.util.{Success, Failure, Try} @@ -55,7 +55,7 @@ object ResolvedDataSource extends Logging { val loader = Utils.getContextOrSparkClassLoader val serviceLoader = ServiceLoader.load(classOf[DataSourceRegister], loader) - serviceLoader.iterator().filter(_.shortName().equalsIgnoreCase(provider)).toList match { + serviceLoader.asScala.filter(_.shortName().equalsIgnoreCase(provider)).toList match { /** the provider format did not match any given registered aliases */ case Nil => Try(loader.loadClass(provider)).orElse(Try(loader.loadClass(provider2))) match { case Success(dataSource) => dataSource diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/parquet/CatalystReadSupport.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/parquet/CatalystReadSupport.scala index 3f8353af6e2ad..0a6bb44445f6e 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/parquet/CatalystReadSupport.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/parquet/CatalystReadSupport.scala @@ -19,7 +19,7 @@ package org.apache.spark.sql.execution.datasources.parquet import java.util.{Map => JMap} -import scala.collection.JavaConversions.{iterableAsScalaIterable, mapAsJavaMap, mapAsScalaMap} +import scala.collection.JavaConverters._ import org.apache.hadoop.conf.Configuration import org.apache.parquet.hadoop.api.ReadSupport.ReadContext @@ -44,7 +44,7 @@ private[parquet] class CatalystReadSupport extends ReadSupport[InternalRow] with val parquetRequestedSchema = readContext.getRequestedSchema val catalystRequestedSchema = - Option(readContext.getReadSupportMetadata).map(_.toMap).flatMap { metadata => + Option(readContext.getReadSupportMetadata).map(_.asScala).flatMap { metadata => metadata // First tries to read requested schema, which may result from projections .get(CatalystReadSupport.SPARK_ROW_REQUESTED_SCHEMA) @@ -123,7 +123,7 @@ private[parquet] class CatalystReadSupport extends ReadSupport[InternalRow] with maybeRequestedSchema.fold(context.getFileSchema) { schemaString => val toParquet = new CatalystSchemaConverter(conf) val fileSchema = 
context.getFileSchema.asGroupType() - val fileFieldNames = fileSchema.getFields.map(_.getName).toSet + val fileFieldNames = fileSchema.getFields.asScala.map(_.getName).toSet StructType // Deserializes the Catalyst schema of requested columns @@ -152,7 +152,7 @@ private[parquet] class CatalystReadSupport extends ReadSupport[InternalRow] with maybeRequestedSchema.map(CatalystReadSupport.SPARK_ROW_REQUESTED_SCHEMA -> _) ++ maybeRowSchema.map(RowWriteSupport.SPARK_ROW_SCHEMA -> _) - new ReadContext(parquetRequestedSchema, metadata) + new ReadContext(parquetRequestedSchema, metadata.asJava) } } diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/parquet/CatalystRowConverter.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/parquet/CatalystRowConverter.scala index cbf0704c4a9a4..f682ca0d8ff4f 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/parquet/CatalystRowConverter.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/parquet/CatalystRowConverter.scala @@ -20,7 +20,7 @@ package org.apache.spark.sql.execution.datasources.parquet import java.math.{BigDecimal, BigInteger} import java.nio.ByteOrder -import scala.collection.JavaConversions._ +import scala.collection.JavaConverters._ import scala.collection.mutable.ArrayBuffer import org.apache.parquet.column.Dictionary @@ -183,7 +183,7 @@ private[parquet] class CatalystRowConverter( // those missing fields and create converters for them, although values of these fields are // always null. val paddedParquetFields = { - val parquetFields = parquetType.getFields + val parquetFields = parquetType.getFields.asScala val parquetFieldNames = parquetFields.map(_.getName).toSet val missingFields = catalystType.filterNot(f => parquetFieldNames.contains(f.name)) diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/parquet/CatalystSchemaConverter.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/parquet/CatalystSchemaConverter.scala index 535f0684e97f9..be6c0545f5a0a 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/parquet/CatalystSchemaConverter.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/parquet/CatalystSchemaConverter.scala @@ -17,7 +17,7 @@ package org.apache.spark.sql.execution.datasources.parquet -import scala.collection.JavaConversions._ +import scala.collection.JavaConverters._ import org.apache.hadoop.conf.Configuration import org.apache.parquet.schema.OriginalType._ @@ -82,7 +82,7 @@ private[parquet] class CatalystSchemaConverter( def convert(parquetSchema: MessageType): StructType = convert(parquetSchema.asGroupType()) private def convert(parquetSchema: GroupType): StructType = { - val fields = parquetSchema.getFields.map { field => + val fields = parquetSchema.getFields.asScala.map { field => field.getRepetition match { case OPTIONAL => StructField(field.getName, convertField(field), nullable = true) diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/parquet/ParquetRelation.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/parquet/ParquetRelation.scala index bbf682aec0f9d..64982f37cf872 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/parquet/ParquetRelation.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/parquet/ParquetRelation.scala @@ -21,7 +21,7 @@ import java.net.URI import java.util.logging.{Logger => 
JLogger} import java.util.{List => JList} -import scala.collection.JavaConversions._ +import scala.collection.JavaConverters._ import scala.collection.mutable import scala.util.{Failure, Try} @@ -336,7 +336,7 @@ private[sql] class ParquetRelation( override def getPartitions: Array[SparkPartition] = { val inputFormat = new ParquetInputFormat[InternalRow] { override def listStatus(jobContext: JobContext): JList[FileStatus] = { - if (cacheMetadata) cachedStatuses else super.listStatus(jobContext) + if (cacheMetadata) cachedStatuses.asJava else super.listStatus(jobContext) } } @@ -344,7 +344,8 @@ private[sql] class ParquetRelation( val rawSplits = inputFormat.getSplits(jobContext) Array.tabulate[SparkPartition](rawSplits.size) { i => - new SqlNewHadoopPartition(id, i, rawSplits(i).asInstanceOf[InputSplit with Writable]) + new SqlNewHadoopPartition( + id, i, rawSplits.get(i).asInstanceOf[InputSplit with Writable]) } } }.asInstanceOf[RDD[Row]] // type erasure hack to pass RDD[InternalRow] as RDD[Row] @@ -588,7 +589,7 @@ private[sql] object ParquetRelation extends Logging { val metadata = footer.getParquetMetadata.getFileMetaData val serializedSchema = metadata .getKeyValueMetaData - .toMap + .asScala.toMap .get(CatalystReadSupport.SPARK_METADATA_KEY) if (serializedSchema.isEmpty) { // Falls back to Parquet schema if no Spark SQL schema found. @@ -745,7 +746,7 @@ private[sql] object ParquetRelation extends Logging { // Reads footers in multi-threaded manner within each task val footers = ParquetFileReader.readAllFootersInParallel( - serializedConf.value, fakeFileStatuses, skipRowGroups) + serializedConf.value, fakeFileStatuses.asJava, skipRowGroups).asScala // Converter used to convert Parquet `MessageType` to Spark SQL `StructType` val converter = @@ -772,7 +773,7 @@ private[sql] object ParquetRelation extends Logging { val fileMetaData = footer.getParquetMetadata.getFileMetaData fileMetaData .getKeyValueMetaData - .toMap + .asScala.toMap .get(CatalystReadSupport.SPARK_METADATA_KEY) .flatMap(deserializeSchemaString) .getOrElse(converter.convert(fileMetaData.getSchema)) diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/parquet/ParquetTypesConverter.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/parquet/ParquetTypesConverter.scala index 42376ef7a9c1f..142301fe87cb6 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/parquet/ParquetTypesConverter.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/parquet/ParquetTypesConverter.scala @@ -18,8 +18,8 @@ package org.apache.spark.sql.execution.datasources.parquet import java.io.IOException +import java.util.{Collections, Arrays} -import scala.collection.JavaConversions._ import scala.util.Try import org.apache.hadoop.conf.Configuration @@ -107,7 +107,7 @@ private[parquet] object ParquetTypesConverter extends Logging { ParquetFileWriter.writeMetadataFile( conf, path, - new Footer(path, new ParquetMetadata(metaData, Nil)) :: Nil) + Arrays.asList(new Footer(path, new ParquetMetadata(metaData, Collections.emptyList())))) } /** diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/joins/ShuffledHashOuterJoin.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/joins/ShuffledHashOuterJoin.scala index ed282f98b7d71..d800c7456bdac 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/joins/ShuffledHashOuterJoin.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/joins/ShuffledHashOuterJoin.scala @@ 
-17,7 +17,7 @@ package org.apache.spark.sql.execution.joins -import scala.collection.JavaConversions._ +import scala.collection.JavaConverters._ import org.apache.spark.annotation.DeveloperApi import org.apache.spark.rdd.RDD @@ -92,9 +92,9 @@ case class ShuffledHashOuterJoin( case FullOuter => // TODO(davies): use UnsafeRow val leftHashTable = - buildHashTable(leftIter, numLeftRows, newProjection(leftKeys, left.output)) + buildHashTable(leftIter, numLeftRows, newProjection(leftKeys, left.output)).asScala val rightHashTable = - buildHashTable(rightIter, numRightRows, newProjection(rightKeys, right.output)) + buildHashTable(rightIter, numRightRows, newProjection(rightKeys, right.output)).asScala (leftHashTable.keySet ++ rightHashTable.keySet).iterator.flatMap { key => fullOuterIterator(key, leftHashTable.getOrElse(key, EMPTY_LIST), diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/pythonUDFs.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/pythonUDFs.scala index 59f8b079ab333..5a58d846ad80b 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/pythonUDFs.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/pythonUDFs.scala @@ -20,7 +20,7 @@ package org.apache.spark.sql.execution import java.io.OutputStream import java.util.{List => JList, Map => JMap} -import scala.collection.JavaConversions._ +import scala.collection.JavaConverters._ import net.razorvine.pickle._ @@ -196,14 +196,15 @@ object EvaluatePython { case (c, BinaryType) if c.getClass.isArray && c.getClass.getComponentType.getName == "byte" => c case (c: java.util.List[_], ArrayType(elementType, _)) => - new GenericArrayData(c.map { e => fromJava(e, elementType)}.toArray) + new GenericArrayData(c.asScala.map { e => fromJava(e, elementType)}.toArray) case (c, ArrayType(elementType, _)) if c.getClass.isArray => new GenericArrayData(c.asInstanceOf[Array[_]].map(e => fromJava(e, elementType))) case (c: java.util.Map[_, _], MapType(keyType, valueType, _)) => - val keys = c.keysIterator.map(fromJava(_, keyType)).toArray - val values = c.valuesIterator.map(fromJava(_, valueType)).toArray + val keyValues = c.asScala.toSeq + val keys = keyValues.map(kv => fromJava(kv._1, keyType)).toArray + val values = keyValues.map(kv => fromJava(kv._2, valueType)).toArray ArrayBasedMapData(keys, values) case (c, StructType(fields)) if c.getClass.isArray => @@ -367,7 +368,7 @@ case class BatchPythonEvaluation(udf: PythonUDF, output: Seq[Attribute], child: val pickle = new Unpickler iter.flatMap { pickedResult => val unpickledBatch = pickle.loads(pickedResult) - unpickledBatch.asInstanceOf[java.util.ArrayList[Any]] + unpickledBatch.asInstanceOf[java.util.ArrayList[Any]].asScala } }.mapPartitions { iter => val row = new GenericMutableRow(1) diff --git a/sql/core/src/test/java/test/org/apache/spark/sql/JavaDataFrameSuite.java b/sql/core/src/test/java/test/org/apache/spark/sql/JavaDataFrameSuite.java index 7abdd3db80341..4867cebf5328c 100644 --- a/sql/core/src/test/java/test/org/apache/spark/sql/JavaDataFrameSuite.java +++ b/sql/core/src/test/java/test/org/apache/spark/sql/JavaDataFrameSuite.java @@ -23,7 +23,7 @@ import java.util.List; import java.util.Map; -import scala.collection.JavaConversions; +import scala.collection.JavaConverters; import scala.collection.Seq; import com.google.common.collect.ImmutableMap; @@ -96,7 +96,7 @@ public void testVarargMethods() { df.groupBy().agg(countDistinct("key", "value")); df.groupBy().agg(countDistinct(col("key"), col("value"))); 
df.select(coalesce(col("key"))); - + // Varargs with mathfunctions DataFrame df2 = context.table("testData2"); df2.select(exp("a"), exp("b")); @@ -172,7 +172,7 @@ public void testCreateDataFrameFromJavaBeans() { Seq outputBuffer = (Seq) first.getJavaMap(2).get("hello"); Assert.assertArrayEquals( bean.getC().get("hello"), - Ints.toArray(JavaConversions.seqAsJavaList(outputBuffer))); + Ints.toArray(JavaConverters.seqAsJavaListConverter(outputBuffer).asJava())); Seq d = first.getAs(3); Assert.assertEquals(bean.getD().size(), d.length()); for (int i = 0; i < d.length(); i++) { @@ -206,7 +206,7 @@ public void testCrosstab() { count++; } } - + @Test public void testFrequentItems() { DataFrame df = context.table("testData2"); diff --git a/sql/core/src/test/scala/org/apache/spark/sql/DataFrameNaFunctionsSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/DataFrameNaFunctionsSuite.scala index cdaa14ac80785..329ffb66083b1 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/DataFrameNaFunctionsSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/DataFrameNaFunctionsSuite.scala @@ -17,7 +17,7 @@ package org.apache.spark.sql -import scala.collection.JavaConversions._ +import scala.collection.JavaConverters._ import org.apache.spark.sql.test.SharedSQLContext @@ -153,11 +153,11 @@ class DataFrameNaFunctionsSuite extends QueryTest with SharedSQLContext { // Test Java version checkAnswer( - df.na.fill(mapAsJavaMap(Map( + df.na.fill(Map( "a" -> "test", "c" -> 1, "d" -> 2.2 - ))), + ).asJava), Row("test", null, 1, 2.2)) } diff --git a/sql/core/src/test/scala/org/apache/spark/sql/QueryTest.scala b/sql/core/src/test/scala/org/apache/spark/sql/QueryTest.scala index 4adcefb7dc4b1..3649c2a97b5ef 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/QueryTest.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/QueryTest.scala @@ -19,7 +19,7 @@ package org.apache.spark.sql import java.util.{Locale, TimeZone} -import scala.collection.JavaConversions._ +import scala.collection.JavaConverters._ import org.apache.spark.sql.catalyst.plans._ import org.apache.spark.sql.catalyst.util._ @@ -145,7 +145,7 @@ object QueryTest { } def checkAnswer(df: DataFrame, expectedAnswer: java.util.List[Row]): String = { - checkAnswer(df, expectedAnswer.toSeq) match { + checkAnswer(df, expectedAnswer.asScala) match { case Some(errorMessage) => errorMessage case None => null } diff --git a/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/parquet/ParquetAvroCompatibilitySuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/parquet/ParquetAvroCompatibilitySuite.scala index 45db619567a22..bd7cf8c10abef 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/parquet/ParquetAvroCompatibilitySuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/parquet/ParquetAvroCompatibilitySuite.scala @@ -20,8 +20,7 @@ package org.apache.spark.sql.execution.datasources.parquet import java.nio.ByteBuffer import java.util.{List => JList, Map => JMap} -import scala.collection.JavaConverters.seqAsJavaListConverter -import scala.collection.JavaConverters.mapAsJavaMapConverter +import scala.collection.JavaConverters._ import org.apache.avro.Schema import org.apache.avro.generic.IndexedRecord diff --git a/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/parquet/ParquetCompatibilityTest.scala b/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/parquet/ParquetCompatibilityTest.scala index 
d85c564e3e8d1..df68432faeeb3 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/parquet/ParquetCompatibilityTest.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/parquet/ParquetCompatibilityTest.scala @@ -17,7 +17,7 @@ package org.apache.spark.sql.execution.datasources.parquet -import scala.collection.JavaConversions._ +import scala.collection.JavaConverters._ import org.apache.hadoop.fs.{Path, PathFilter} import org.apache.parquet.hadoop.ParquetFileReader @@ -40,8 +40,9 @@ private[sql] abstract class ParquetCompatibilityTest extends QueryTest with Parq override def accept(path: Path): Boolean = pathFilter(path) }).toSeq - val footers = ParquetFileReader.readAllFootersInParallel(configuration, parquetFiles, true) - footers.head.getParquetMetadata.getFileMetaData.getSchema + val footers = + ParquetFileReader.readAllFootersInParallel(configuration, parquetFiles.asJava, true) + footers.iterator().next().getParquetMetadata.getFileMetaData.getSchema } protected def logParquetSchema(path: String): Unit = { diff --git a/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/parquet/ParquetIOSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/parquet/ParquetIOSuite.scala index e6b0a2ea95e38..08d2b9dee99b0 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/parquet/ParquetIOSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/parquet/ParquetIOSuite.scala @@ -17,7 +17,9 @@ package org.apache.spark.sql.execution.datasources.parquet -import scala.collection.JavaConversions._ +import java.util.Collections + +import scala.collection.JavaConverters._ import scala.reflect.ClassTag import scala.reflect.runtime.universe.TypeTag @@ -28,7 +30,7 @@ import org.apache.parquet.example.data.simple.SimpleGroup import org.apache.parquet.example.data.{Group, GroupWriter} import org.apache.parquet.hadoop.api.WriteSupport import org.apache.parquet.hadoop.api.WriteSupport.WriteContext -import org.apache.parquet.hadoop.metadata.{CompressionCodecName, FileMetaData, ParquetMetadata} +import org.apache.parquet.hadoop.metadata.{BlockMetaData, CompressionCodecName, FileMetaData, ParquetMetadata} import org.apache.parquet.hadoop.{Footer, ParquetFileWriter, ParquetOutputCommitter, ParquetWriter} import org.apache.parquet.io.api.RecordConsumer import org.apache.parquet.schema.{MessageType, MessageTypeParser} @@ -205,9 +207,8 @@ class ParquetIOSuite extends QueryTest with ParquetTest with SharedSQLContext { test("compression codec") { def compressionCodecFor(path: String): String = { val codecs = ParquetTypesConverter - .readMetaData(new Path(path), Some(configuration)) - .getBlocks - .flatMap(_.getColumns) + .readMetaData(new Path(path), Some(configuration)).getBlocks.asScala + .flatMap(_.getColumns.asScala) .map(_.getCodec.name()) .distinct @@ -348,14 +349,16 @@ class ParquetIOSuite extends QueryTest with ParquetTest with SharedSQLContext { """.stripMargin) withTempPath { location => - val extraMetadata = Map(CatalystReadSupport.SPARK_METADATA_KEY -> sparkSchema.toString) + val extraMetadata = Collections.singletonMap( + CatalystReadSupport.SPARK_METADATA_KEY, sparkSchema.toString) val fileMetadata = new FileMetaData(parquetSchema, extraMetadata, "Spark") val path = new Path(location.getCanonicalPath) ParquetFileWriter.writeMetadataFile( sqlContext.sparkContext.hadoopConfiguration, path, - new Footer(path, new ParquetMetadata(fileMetadata, Nil)) :: Nil) + 
Collections.singletonList( + new Footer(path, new ParquetMetadata(fileMetadata, Collections.emptyList())))) assertResult(sqlContext.read.parquet(path.toString).schema) { StructType( @@ -386,7 +389,7 @@ class ParquetIOSuite extends QueryTest with ParquetTest with SharedSQLContext { } finally { // Hadoop 1 doesn't have `Configuration.unset` configuration.clear() - clonedConf.foreach(entry => configuration.set(entry.getKey, entry.getValue)) + clonedConf.asScala.foreach(entry => configuration.set(entry.getKey, entry.getValue)) } } @@ -410,7 +413,7 @@ class ParquetIOSuite extends QueryTest with ParquetTest with SharedSQLContext { } finally { // Hadoop 1 doesn't have `Configuration.unset` configuration.clear() - clonedConf.foreach(entry => configuration.set(entry.getKey, entry.getValue)) + clonedConf.asScala.foreach(entry => configuration.set(entry.getKey, entry.getValue)) } } @@ -434,7 +437,7 @@ class ParquetIOSuite extends QueryTest with ParquetTest with SharedSQLContext { } finally { // Hadoop 1 doesn't have `Configuration.unset` configuration.clear() - clonedConf.foreach(entry => configuration.set(entry.getKey, entry.getValue)) + clonedConf.asScala.foreach(entry => configuration.set(entry.getKey, entry.getValue)) } } } @@ -481,7 +484,7 @@ class ParquetIOSuite extends QueryTest with ParquetTest with SharedSQLContext { } finally { // Hadoop 1 doesn't have `Configuration.unset` configuration.clear() - clonedConf.foreach(entry => configuration.set(entry.getKey, entry.getValue)) + clonedConf.asScala.foreach(entry => configuration.set(entry.getKey, entry.getValue)) } } } diff --git a/sql/hive-thriftserver/src/main/scala/org/apache/spark/sql/hive/thriftserver/SparkExecuteStatementOperation.scala b/sql/hive-thriftserver/src/main/scala/org/apache/spark/sql/hive/thriftserver/SparkExecuteStatementOperation.scala index 02cc7e5efa521..306f98bcb5344 100644 --- a/sql/hive-thriftserver/src/main/scala/org/apache/spark/sql/hive/thriftserver/SparkExecuteStatementOperation.scala +++ b/sql/hive-thriftserver/src/main/scala/org/apache/spark/sql/hive/thriftserver/SparkExecuteStatementOperation.scala @@ -20,9 +20,9 @@ package org.apache.spark.sql.hive.thriftserver import java.security.PrivilegedExceptionAction import java.sql.{Date, Timestamp} import java.util.concurrent.RejectedExecutionException -import java.util.{Map => JMap, UUID} +import java.util.{Arrays, Map => JMap, UUID} -import scala.collection.JavaConversions._ +import scala.collection.JavaConverters._ import scala.collection.mutable.{ArrayBuffer, Map => SMap} import scala.util.control.NonFatal @@ -126,13 +126,13 @@ private[hive] class SparkExecuteStatementOperation( def getResultSetSchema: TableSchema = { if (result == null || result.queryExecution.analyzed.output.size == 0) { - new TableSchema(new FieldSchema("Result", "string", "") :: Nil) + new TableSchema(Arrays.asList(new FieldSchema("Result", "string", ""))) } else { logInfo(s"Result Schema: ${result.queryExecution.analyzed.output}") val schema = result.queryExecution.analyzed.output.map { attr => new FieldSchema(attr.name, HiveMetastoreTypes.toMetastoreType(attr.dataType), "") } - new TableSchema(schema) + new TableSchema(schema.asJava) } } @@ -298,7 +298,7 @@ private[hive] class SparkExecuteStatementOperation( sqlOperationConf = new HiveConf(sqlOperationConf) // apply overlay query specific settings, if any - getConfOverlay().foreach { case (k, v) => + getConfOverlay().asScala.foreach { case (k, v) => try { sqlOperationConf.verifyAndSet(k, v) } catch { diff --git 
a/sql/hive-thriftserver/src/main/scala/org/apache/spark/sql/hive/thriftserver/SparkSQLCLIDriver.scala b/sql/hive-thriftserver/src/main/scala/org/apache/spark/sql/hive/thriftserver/SparkSQLCLIDriver.scala index 7799704c819d9..a29df567983b1 100644 --- a/sql/hive-thriftserver/src/main/scala/org/apache/spark/sql/hive/thriftserver/SparkSQLCLIDriver.scala +++ b/sql/hive-thriftserver/src/main/scala/org/apache/spark/sql/hive/thriftserver/SparkSQLCLIDriver.scala @@ -17,11 +17,11 @@ package org.apache.spark.sql.hive.thriftserver -import scala.collection.JavaConversions._ - import java.io._ import java.util.{ArrayList => JArrayList, Locale} +import scala.collection.JavaConverters._ + import jline.console.ConsoleReader import jline.console.history.FileHistory @@ -101,9 +101,9 @@ private[hive] object SparkSQLCLIDriver extends Logging { // Set all properties specified via command line. val conf: HiveConf = sessionState.getConf - sessionState.cmdProperties.entrySet().foreach { item => - val key = item.getKey.asInstanceOf[String] - val value = item.getValue.asInstanceOf[String] + sessionState.cmdProperties.entrySet().asScala.foreach { item => + val key = item.getKey.toString + val value = item.getValue.toString // We do not propagate metastore options to the execution copy of hive. if (key != "javax.jdo.option.ConnectionURL") { conf.set(key, value) @@ -316,15 +316,15 @@ private[hive] class SparkSQLCLIDriver extends CliDriver with Logging { if (HiveConf.getBoolVar(conf, HiveConf.ConfVars.HIVE_CLI_PRINT_HEADER)) { // Print the column names. - Option(driver.getSchema.getFieldSchemas).map { fields => - out.println(fields.map(_.getName).mkString("\t")) + Option(driver.getSchema.getFieldSchemas).foreach { fields => + out.println(fields.asScala.map(_.getName).mkString("\t")) } } var counter = 0 try { while (!out.checkError() && driver.getResults(res)) { - res.foreach{ l => + res.asScala.foreach { l => counter += 1 out.println(l) } diff --git a/sql/hive-thriftserver/src/main/scala/org/apache/spark/sql/hive/thriftserver/SparkSQLCLIService.scala b/sql/hive-thriftserver/src/main/scala/org/apache/spark/sql/hive/thriftserver/SparkSQLCLIService.scala index 644165acf70a7..5ad8c54f296d5 100644 --- a/sql/hive-thriftserver/src/main/scala/org/apache/spark/sql/hive/thriftserver/SparkSQLCLIService.scala +++ b/sql/hive-thriftserver/src/main/scala/org/apache/spark/sql/hive/thriftserver/SparkSQLCLIService.scala @@ -21,6 +21,8 @@ import java.io.IOException import java.util.{List => JList} import javax.security.auth.login.LoginException +import scala.collection.JavaConverters._ + import org.apache.commons.logging.Log import org.apache.hadoop.hive.conf.HiveConf import org.apache.hadoop.hive.shims.Utils @@ -34,8 +36,6 @@ import org.apache.hive.service.{AbstractService, Service, ServiceException} import org.apache.spark.sql.hive.HiveContext import org.apache.spark.sql.hive.thriftserver.ReflectionUtils._ -import scala.collection.JavaConversions._ - private[hive] class SparkSQLCLIService(hiveServer: HiveServer2, hiveContext: HiveContext) extends CLIService(hiveServer) with ReflectedCompositeService { @@ -76,7 +76,7 @@ private[thriftserver] trait ReflectedCompositeService { this: AbstractService => def initCompositeService(hiveConf: HiveConf) { // Emulating `CompositeService.init(hiveConf)` val serviceList = getAncestorField[JList[Service]](this, 2, "serviceList") - serviceList.foreach(_.init(hiveConf)) + serviceList.asScala.foreach(_.init(hiveConf)) // Emulating `AbstractService.init(hiveConf)` invoke(classOf[AbstractService], this, 
"ensureCurrentState", classOf[STATE] -> STATE.NOTINITED) diff --git a/sql/hive-thriftserver/src/main/scala/org/apache/spark/sql/hive/thriftserver/SparkSQLDriver.scala b/sql/hive-thriftserver/src/main/scala/org/apache/spark/sql/hive/thriftserver/SparkSQLDriver.scala index 77272aecf2835..2619286afc148 100644 --- a/sql/hive-thriftserver/src/main/scala/org/apache/spark/sql/hive/thriftserver/SparkSQLDriver.scala +++ b/sql/hive-thriftserver/src/main/scala/org/apache/spark/sql/hive/thriftserver/SparkSQLDriver.scala @@ -17,7 +17,9 @@ package org.apache.spark.sql.hive.thriftserver -import java.util.{ArrayList => JArrayList, List => JList} +import java.util.{Arrays, ArrayList => JArrayList, List => JList} + +import scala.collection.JavaConverters._ import org.apache.commons.lang3.exception.ExceptionUtils import org.apache.hadoop.hive.metastore.api.{FieldSchema, Schema} @@ -27,8 +29,6 @@ import org.apache.hadoop.hive.ql.processors.CommandProcessorResponse import org.apache.spark.Logging import org.apache.spark.sql.hive.{HiveContext, HiveMetastoreTypes} -import scala.collection.JavaConversions._ - private[hive] class SparkSQLDriver( val context: HiveContext = SparkSQLEnv.hiveContext) extends Driver @@ -43,14 +43,14 @@ private[hive] class SparkSQLDriver( private def getResultSetSchema(query: context.QueryExecution): Schema = { val analyzed = query.analyzed logDebug(s"Result Schema: ${analyzed.output}") - if (analyzed.output.size == 0) { - new Schema(new FieldSchema("Response code", "string", "") :: Nil, null) + if (analyzed.output.isEmpty) { + new Schema(Arrays.asList(new FieldSchema("Response code", "string", "")), null) } else { val fieldSchemas = analyzed.output.map { attr => new FieldSchema(attr.name, HiveMetastoreTypes.toMetastoreType(attr.dataType), "") } - new Schema(fieldSchemas, null) + new Schema(fieldSchemas.asJava, null) } } @@ -79,7 +79,7 @@ private[hive] class SparkSQLDriver( if (hiveResponse == null) { false } else { - res.asInstanceOf[JArrayList[String]].addAll(hiveResponse) + res.asInstanceOf[JArrayList[String]].addAll(hiveResponse.asJava) hiveResponse = null true } diff --git a/sql/hive-thriftserver/src/main/scala/org/apache/spark/sql/hive/thriftserver/SparkSQLEnv.scala b/sql/hive-thriftserver/src/main/scala/org/apache/spark/sql/hive/thriftserver/SparkSQLEnv.scala index 1d41c46131828..bacf6cc458fd5 100644 --- a/sql/hive-thriftserver/src/main/scala/org/apache/spark/sql/hive/thriftserver/SparkSQLEnv.scala +++ b/sql/hive-thriftserver/src/main/scala/org/apache/spark/sql/hive/thriftserver/SparkSQLEnv.scala @@ -19,7 +19,7 @@ package org.apache.spark.sql.hive.thriftserver import java.io.PrintStream -import scala.collection.JavaConversions._ +import scala.collection.JavaConverters._ import org.apache.spark.scheduler.StatsReportListener import org.apache.spark.sql.hive.HiveContext @@ -64,7 +64,7 @@ private[hive] object SparkSQLEnv extends Logging { hiveContext.setConf("spark.sql.hive.version", HiveContext.hiveExecutionVersion) if (log.isDebugEnabled) { - hiveContext.hiveconf.getAllProperties.toSeq.sorted.foreach { case (k, v) => + hiveContext.hiveconf.getAllProperties.asScala.toSeq.sorted.foreach { case (k, v) => logDebug(s"HiveConf var: $k=$v") } } diff --git a/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveContext.scala b/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveContext.scala index 17cc83087fb1d..c0a458fa9ab8d 100644 --- a/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveContext.scala +++ b/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveContext.scala @@ -22,7 
+22,7 @@ import java.net.{URL, URLClassLoader} import java.sql.Timestamp import java.util.concurrent.TimeUnit -import scala.collection.JavaConversions._ +import scala.collection.JavaConverters._ import scala.collection.mutable.HashMap import scala.language.implicitConversions import scala.concurrent.duration._ @@ -194,7 +194,7 @@ class HiveContext(sc: SparkContext) extends SQLContext(sc) with Logging { logInfo("defalt warehouse location is " + defaltWarehouseLocation) // `configure` goes second to override other settings. - val allConfig = metadataConf.iterator.map(e => e.getKey -> e.getValue).toMap ++ configure + val allConfig = metadataConf.asScala.map(e => e.getKey -> e.getValue).toMap ++ configure val isolatedLoader = if (hiveMetastoreJars == "builtin") { if (hiveExecutionVersion != hiveMetastoreVersion) { diff --git a/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveInspectors.scala b/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveInspectors.scala index 64fffdbf9b020..cfe2bb05ad89e 100644 --- a/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveInspectors.scala +++ b/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveInspectors.scala @@ -17,6 +17,8 @@ package org.apache.spark.sql.hive +import scala.collection.JavaConverters._ + import org.apache.hadoop.hive.common.`type`.{HiveDecimal, HiveVarchar} import org.apache.hadoop.hive.serde2.objectinspector.primitive._ import org.apache.hadoop.hive.serde2.objectinspector.{StructField => HiveStructField, _} @@ -31,9 +33,6 @@ import org.apache.spark.sql.types._ import org.apache.spark.sql.{AnalysisException, types} import org.apache.spark.unsafe.types.UTF8String -/* Implicit conversions */ -import scala.collection.JavaConversions._ - /** * 1. The Underlying data type in catalyst and in Hive * In catalyst: @@ -290,13 +289,13 @@ private[hive] trait HiveInspectors { DateTimeUtils.fromJavaDate(poi.getWritableConstantValue.get()) case mi: StandardConstantMapObjectInspector => // take the value from the map inspector object, rather than the input data - val map = mi.getWritableConstantValue - val keys = map.keysIterator.map(unwrap(_, mi.getMapKeyObjectInspector)).toArray - val values = map.valuesIterator.map(unwrap(_, mi.getMapValueObjectInspector)).toArray + val keyValues = mi.getWritableConstantValue.asScala.toSeq + val keys = keyValues.map(kv => unwrap(kv._1, mi.getMapKeyObjectInspector)).toArray + val values = keyValues.map(kv => unwrap(kv._2, mi.getMapValueObjectInspector)).toArray ArrayBasedMapData(keys, values) case li: StandardConstantListObjectInspector => // take the value from the list inspector object, rather than the input data - val values = li.getWritableConstantValue + val values = li.getWritableConstantValue.asScala .map(unwrap(_, li.getListElementObjectInspector)) .toArray new GenericArrayData(values) @@ -342,7 +341,7 @@ private[hive] trait HiveInspectors { case li: ListObjectInspector => Option(li.getList(data)) .map { l => - val values = l.map(unwrap(_, li.getListElementObjectInspector)).toArray + val values = l.asScala.map(unwrap(_, li.getListElementObjectInspector)).toArray new GenericArrayData(values) } .orNull @@ -351,15 +350,16 @@ private[hive] trait HiveInspectors { if (map == null) { null } else { - val keys = map.keysIterator.map(unwrap(_, mi.getMapKeyObjectInspector)).toArray - val values = map.valuesIterator.map(unwrap(_, mi.getMapValueObjectInspector)).toArray + val keyValues = map.asScala.toSeq + val keys = keyValues.map(kv => unwrap(kv._1, mi.getMapKeyObjectInspector)).toArray + val values = 
keyValues.map(kv => unwrap(kv._2, mi.getMapValueObjectInspector)).toArray ArrayBasedMapData(keys, values) } // currently, hive doesn't provide the ConstantStructObjectInspector case si: StructObjectInspector => val allRefs = si.getAllStructFieldRefs - InternalRow.fromSeq( - allRefs.map(r => unwrap(si.getStructFieldData(data, r), r.getFieldObjectInspector))) + InternalRow.fromSeq(allRefs.asScala.map( + r => unwrap(si.getStructFieldData(data, r), r.getFieldObjectInspector))) } @@ -403,14 +403,14 @@ private[hive] trait HiveInspectors { case soi: StandardStructObjectInspector => val schema = dataType.asInstanceOf[StructType] - val wrappers = soi.getAllStructFieldRefs.zip(schema.fields).map { case (ref, field) => - wrapperFor(ref.getFieldObjectInspector, field.dataType) + val wrappers = soi.getAllStructFieldRefs.asScala.zip(schema.fields).map { + case (ref, field) => wrapperFor(ref.getFieldObjectInspector, field.dataType) } (o: Any) => { if (o != null) { val struct = soi.create() val row = o.asInstanceOf[InternalRow] - soi.getAllStructFieldRefs.zip(wrappers).zipWithIndex.foreach { + soi.getAllStructFieldRefs.asScala.zip(wrappers).zipWithIndex.foreach { case ((field, wrapper), i) => soi.setStructFieldData(struct, field, wrapper(row.get(i, schema(i).dataType))) } @@ -537,7 +537,7 @@ private[hive] trait HiveInspectors { // 1. create the pojo (most likely) object val result = x.create() var i = 0 - while (i < fieldRefs.length) { + while (i < fieldRefs.size) { // 2. set the property for the pojo val tpe = structType(i).dataType x.setStructFieldData( @@ -552,9 +552,9 @@ private[hive] trait HiveInspectors { val fieldRefs = x.getAllStructFieldRefs val structType = dataType.asInstanceOf[StructType] val row = a.asInstanceOf[InternalRow] - val result = new java.util.ArrayList[AnyRef](fieldRefs.length) + val result = new java.util.ArrayList[AnyRef](fieldRefs.size) var i = 0 - while (i < fieldRefs.length) { + while (i < fieldRefs.size) { val tpe = structType(i).dataType result.add(wrap(row.get(i, tpe), fieldRefs.get(i).getFieldObjectInspector, tpe)) i += 1 @@ -712,10 +712,10 @@ private[hive] trait HiveInspectors { def inspectorToDataType(inspector: ObjectInspector): DataType = inspector match { case s: StructObjectInspector => - StructType(s.getAllStructFieldRefs.map(f => { + StructType(s.getAllStructFieldRefs.asScala.map(f => types.StructField( f.getFieldName, inspectorToDataType(f.getFieldObjectInspector), nullable = true) - })) + )) case l: ListObjectInspector => ArrayType(inspectorToDataType(l.getListElementObjectInspector)) case m: MapObjectInspector => MapType( diff --git a/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveMetastoreCatalog.scala b/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveMetastoreCatalog.scala index 98d21aa76d64e..b8da0840ae569 100644 --- a/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveMetastoreCatalog.scala +++ b/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveMetastoreCatalog.scala @@ -17,7 +17,7 @@ package org.apache.spark.sql.hive -import scala.collection.JavaConversions._ +import scala.collection.JavaConverters._ import scala.collection.mutable import com.google.common.base.Objects @@ -483,7 +483,7 @@ private[hive] class HiveMetastoreCatalog(val client: ClientInterface, hive: Hive // are empty. 
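Both the EvaluatePython and HiveInspectors hunks above switch from iterating a Java map's keys and values separately to converting the map once and splitting the pairs, which keeps keys and values aligned by construction. A sketch of that pattern; the unwrap helpers and types here are hypothetical stand-ins, not the real inspector calls:

import scala.collection.JavaConverters._

object MapUnwrapSketch {
  // Hypothetical per-element converters standing in for the real unwrap(...) calls.
  def unwrapKey(k: Any): Any = k
  def unwrapValue(v: Any): Any = v

  def split(jmap: java.util.Map[_, _]): (Array[Any], Array[Any]) = {
    // Materialize the entries once so keys(i) and values(i) come from the same entry.
    val keyValues = jmap.asScala.toSeq
    val keys = keyValues.map(kv => unwrapKey(kv._1)).toArray
    val values = keyValues.map(kv => unwrapValue(kv._2)).toArray
    (keys, values)
  }
}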
val partitions = metastoreRelation.getHiveQlPartitions().map { p => val location = p.getLocation - val values = InternalRow.fromSeq(p.getValues.zip(partitionColumnDataTypes).map { + val values = InternalRow.fromSeq(p.getValues.asScala.zip(partitionColumnDataTypes).map { case (rawValue, dataType) => Cast(Literal(rawValue), dataType).eval(null) }) ParquetPartition(values, location) @@ -798,9 +798,9 @@ private[hive] case class MetastoreRelation val sd = new org.apache.hadoop.hive.metastore.api.StorageDescriptor() tTable.setSd(sd) - sd.setCols(table.schema.map(c => new FieldSchema(c.name, c.hiveType, c.comment))) + sd.setCols(table.schema.map(c => new FieldSchema(c.name, c.hiveType, c.comment)).asJava) tTable.setPartitionKeys( - table.partitionColumns.map(c => new FieldSchema(c.name, c.hiveType, c.comment))) + table.partitionColumns.map(c => new FieldSchema(c.name, c.hiveType, c.comment)).asJava) table.location.foreach(sd.setLocation) table.inputFormat.foreach(sd.setInputFormat) @@ -852,11 +852,11 @@ private[hive] case class MetastoreRelation val tPartition = new org.apache.hadoop.hive.metastore.api.Partition tPartition.setDbName(databaseName) tPartition.setTableName(tableName) - tPartition.setValues(p.values) + tPartition.setValues(p.values.asJava) val sd = new org.apache.hadoop.hive.metastore.api.StorageDescriptor() tPartition.setSd(sd) - sd.setCols(table.schema.map(c => new FieldSchema(c.name, c.hiveType, c.comment))) + sd.setCols(table.schema.map(c => new FieldSchema(c.name, c.hiveType, c.comment)).asJava) sd.setLocation(p.storage.location) sd.setInputFormat(p.storage.inputFormat) diff --git a/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveQl.scala b/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveQl.scala index ad33dee555dd2..d5cd7e98b5267 100644 --- a/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveQl.scala +++ b/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveQl.scala @@ -20,6 +20,9 @@ package org.apache.spark.sql.hive import java.sql.Date import java.util.Locale +import scala.collection.JavaConverters._ +import scala.collection.mutable.ArrayBuffer + import org.apache.hadoop.hive.conf.HiveConf import org.apache.hadoop.hive.conf.HiveConf.ConfVars import org.apache.hadoop.hive.serde.serdeConstants @@ -48,10 +51,6 @@ import org.apache.spark.sql.types._ import org.apache.spark.unsafe.types.CalendarInterval import org.apache.spark.util.random.RandomSampler -/* Implicit conversions */ -import scala.collection.JavaConversions._ -import scala.collection.mutable.ArrayBuffer - /** * Used when we need to start parsing the AST before deciding that we are going to pass the command * back for Hive to execute natively. Will be replaced with a native command that contains the @@ -202,7 +201,7 @@ private[hive] object HiveQl extends Logging { * Returns a scala.Seq equivalent to [s] or Nil if [s] is null. */ private def nilIfEmpty[A](s: java.util.List[A]): Seq[A] = - Option(s).map(_.toSeq).getOrElse(Nil) + Option(s).map(_.asScala).getOrElse(Nil) /** * Returns this ASTNode with the text changed to `newText`. 
@@ -217,7 +216,7 @@ private[hive] object HiveQl extends Logging { */ def withChildren(newChildren: Seq[ASTNode]): ASTNode = { (1 to n.getChildCount).foreach(_ => n.deleteChild(0)) - n.addChildren(newChildren) + n.addChildren(newChildren.asJava) n } @@ -323,11 +322,11 @@ private[hive] object HiveQl extends Logging { assert(tree.asInstanceOf[ASTNode].getText == "TOK_CREATETABLE", "Only CREATE TABLE supported.") val tableOps = tree.getChildren val colList = - tableOps + tableOps.asScala .find(_.asInstanceOf[ASTNode].getText == "TOK_TABCOLLIST") .getOrElse(sys.error("No columnList!")).getChildren - colList.map(nodeToAttribute) + colList.asScala.map(nodeToAttribute) } /** Extractor for matching Hive's AST Tokens. */ @@ -337,7 +336,7 @@ private[hive] object HiveQl extends Logging { case t: ASTNode => CurrentOrigin.setPosition(t.getLine, t.getCharPositionInLine) Some((t.getText, - Option(t.getChildren).map(_.toList).getOrElse(Nil).asInstanceOf[Seq[ASTNode]])) + Option(t.getChildren).map(_.asScala.toList).getOrElse(Nil).asInstanceOf[Seq[ASTNode]])) case _ => None } } @@ -424,7 +423,9 @@ private[hive] object HiveQl extends Logging { protected def extractDbNameTableName(tableNameParts: Node): (Option[String], String) = { val (db, tableName) = - tableNameParts.getChildren.map { case Token(part, Nil) => cleanIdentifier(part) } match { + tableNameParts.getChildren.asScala.map { + case Token(part, Nil) => cleanIdentifier(part) + } match { case Seq(tableOnly) => (None, tableOnly) case Seq(databaseName, table) => (Some(databaseName), table) } @@ -433,7 +434,9 @@ private[hive] object HiveQl extends Logging { } protected def extractTableIdent(tableNameParts: Node): Seq[String] = { - tableNameParts.getChildren.map { case Token(part, Nil) => cleanIdentifier(part) } match { + tableNameParts.getChildren.asScala.map { + case Token(part, Nil) => cleanIdentifier(part) + } match { case Seq(tableOnly) => Seq(tableOnly) case Seq(databaseName, table) => Seq(databaseName, table) case other => sys.error("Hive only supports tables names like 'tableName' " + @@ -624,7 +627,7 @@ https://cwiki.apache.org/confluence/display/Hive/Enhanced+Aggregation%2C+Cube%2C val cols = BaseSemanticAnalyzer.getColumns(list, true) if (cols != null) { tableDesc = tableDesc.copy( - schema = cols.map { field => + schema = cols.asScala.map { field => HiveColumn(field.getName, field.getType, field.getComment) }) } @@ -636,7 +639,7 @@ https://cwiki.apache.org/confluence/display/Hive/Enhanced+Aggregation%2C+Cube%2C val cols = BaseSemanticAnalyzer.getColumns(list(0), false) if (cols != null) { tableDesc = tableDesc.copy( - partitionColumns = cols.map { field => + partitionColumns = cols.asScala.map { field => HiveColumn(field.getName, field.getType, field.getComment) }) } @@ -672,7 +675,7 @@ https://cwiki.apache.org/confluence/display/Hive/Enhanced+Aggregation%2C+Cube%2C case _ => assert(false) } tableDesc = tableDesc.copy( - serdeProperties = tableDesc.serdeProperties ++ serdeParams) + serdeProperties = tableDesc.serdeProperties ++ serdeParams.asScala) case Token("TOK_TABLELOCATION", child :: Nil) => var location = BaseSemanticAnalyzer.unescapeSQLString(child.getText) location = EximUtil.relativeToAbsolutePath(hiveConf, location) @@ -684,7 +687,8 @@ https://cwiki.apache.org/confluence/display/Hive/Enhanced+Aggregation%2C+Cube%2C val serdeParams = new java.util.HashMap[String, String]() BaseSemanticAnalyzer.readProps( (child.getChild(1).getChild(0)).asInstanceOf[ASTNode], serdeParams) - tableDesc = tableDesc.copy(serdeProperties = 
tableDesc.serdeProperties ++ serdeParams) + tableDesc = tableDesc.copy( + serdeProperties = tableDesc.serdeProperties ++ serdeParams.asScala) } case Token("TOK_FILEFORMAT_GENERIC", child :: Nil) => child.getText().toLowerCase(Locale.ENGLISH) match { @@ -847,7 +851,7 @@ https://cwiki.apache.org/confluence/display/Hive/Enhanced+Aggregation%2C+Cube%2C } val withWhere = whereClause.map { whereNode => - val Seq(whereExpr) = whereNode.getChildren.toSeq + val Seq(whereExpr) = whereNode.getChildren.asScala Filter(nodeToExpr(whereExpr), relations) }.getOrElse(relations) @@ -856,7 +860,7 @@ https://cwiki.apache.org/confluence/display/Hive/Enhanced+Aggregation%2C+Cube%2C // Script transformations are expressed as a select clause with a single expression of type // TOK_TRANSFORM - val transformation = select.getChildren.head match { + val transformation = select.getChildren.iterator().next() match { case Token("TOK_SELEXPR", Token("TOK_TRANSFORM", Token("TOK_EXPLIST", inputExprs) :: @@ -925,10 +929,10 @@ https://cwiki.apache.org/confluence/display/Hive/Enhanced+Aggregation%2C+Cube%2C val withLateralView = lateralViewClause.map { lv => val Token("TOK_SELECT", - Token("TOK_SELEXPR", clauses) :: Nil) = lv.getChildren.head + Token("TOK_SELEXPR", clauses) :: Nil) = lv.getChildren.iterator().next() - val alias = - getClause("TOK_TABALIAS", clauses).getChildren.head.asInstanceOf[ASTNode].getText + val alias = getClause("TOK_TABALIAS", clauses).getChildren.iterator().next() + .asInstanceOf[ASTNode].getText val (generator, attributes) = nodesToGenerator(clauses) Generate( @@ -944,7 +948,7 @@ https://cwiki.apache.org/confluence/display/Hive/Enhanced+Aggregation%2C+Cube%2C // (if there is a group by) or a script transformation. val withProject: LogicalPlan = transformation.getOrElse { val selectExpressions = - select.getChildren.flatMap(selExprNodeToExpr).map(UnresolvedAlias(_)).toSeq + select.getChildren.asScala.flatMap(selExprNodeToExpr).map(UnresolvedAlias) Seq( groupByClause.map(e => e match { case Token("TOK_GROUPBY", children) => @@ -973,7 +977,7 @@ https://cwiki.apache.org/confluence/display/Hive/Enhanced+Aggregation%2C+Cube%2C // Handle HAVING clause. val withHaving = havingClause.map { h => - val havingExpr = h.getChildren.toSeq match { case Seq(hexpr) => nodeToExpr(hexpr) } + val havingExpr = h.getChildren.asScala match { case Seq(hexpr) => nodeToExpr(hexpr) } // Note that we added a cast to boolean. If the expression itself is already boolean, // the optimizer will get rid of the unnecessary cast. Filter(Cast(havingExpr, BooleanType), withProject) @@ -983,32 +987,42 @@ https://cwiki.apache.org/confluence/display/Hive/Enhanced+Aggregation%2C+Cube%2C val withDistinct = if (selectDistinctClause.isDefined) Distinct(withHaving) else withHaving - // Handle ORDER BY, SORT BY, DISTRIBETU BY, and CLUSTER BY clause. + // Handle ORDER BY, SORT BY, DISTRIBUTE BY, and CLUSTER BY clause. 
val withSort = (orderByClause, sortByClause, distributeByClause, clusterByClause) match { case (Some(totalOrdering), None, None, None) => - Sort(totalOrdering.getChildren.map(nodeToSortOrder), true, withDistinct) + Sort(totalOrdering.getChildren.asScala.map(nodeToSortOrder), true, withDistinct) case (None, Some(perPartitionOrdering), None, None) => - Sort(perPartitionOrdering.getChildren.map(nodeToSortOrder), false, withDistinct) + Sort( + perPartitionOrdering.getChildren.asScala.map(nodeToSortOrder), + false, withDistinct) case (None, None, Some(partitionExprs), None) => - RepartitionByExpression(partitionExprs.getChildren.map(nodeToExpr), withDistinct) + RepartitionByExpression( + partitionExprs.getChildren.asScala.map(nodeToExpr), withDistinct) case (None, Some(perPartitionOrdering), Some(partitionExprs), None) => - Sort(perPartitionOrdering.getChildren.map(nodeToSortOrder), false, - RepartitionByExpression(partitionExprs.getChildren.map(nodeToExpr), withDistinct)) + Sort( + perPartitionOrdering.getChildren.asScala.map(nodeToSortOrder), false, + RepartitionByExpression( + partitionExprs.getChildren.asScala.map(nodeToExpr), + withDistinct)) case (None, None, None, Some(clusterExprs)) => - Sort(clusterExprs.getChildren.map(nodeToExpr).map(SortOrder(_, Ascending)), false, - RepartitionByExpression(clusterExprs.getChildren.map(nodeToExpr), withDistinct)) + Sort( + clusterExprs.getChildren.asScala.map(nodeToExpr).map(SortOrder(_, Ascending)), + false, + RepartitionByExpression( + clusterExprs.getChildren.asScala.map(nodeToExpr), + withDistinct)) case (None, None, None, None) => withDistinct case _ => sys.error("Unsupported set of ordering / distribution clauses.") } val withLimit = - limitClause.map(l => nodeToExpr(l.getChildren.head)) + limitClause.map(l => nodeToExpr(l.getChildren.iterator().next())) .map(Limit(_, withSort)) .getOrElse(withSort) // Collect all window specifications defined in the WINDOW clause. 
- val windowDefinitions = windowClause.map(_.getChildren.toSeq.collect { + val windowDefinitions = windowClause.map(_.getChildren.asScala.collect { case Token("TOK_WINDOWDEF", Token(windowName, Nil) :: Token("TOK_WINDOWSPEC", spec) :: Nil) => windowName -> nodesToWindowSpecification(spec) @@ -1063,7 +1077,8 @@ https://cwiki.apache.org/confluence/display/Hive/Enhanced+Aggregation%2C+Cube%2C val Token("TOK_SELECT", Token("TOK_SELEXPR", clauses) :: Nil) = selectClause - val alias = getClause("TOK_TABALIAS", clauses).getChildren.head.asInstanceOf[ASTNode].getText + val alias = getClause("TOK_TABALIAS", clauses).getChildren.iterator().next() + .asInstanceOf[ASTNode].getText val (generator, attributes) = nodesToGenerator(clauses) Generate( @@ -1092,7 +1107,9 @@ https://cwiki.apache.org/confluence/display/Hive/Enhanced+Aggregation%2C+Cube%2C } val tableIdent = - tableNameParts.getChildren.map{ case Token(part, Nil) => cleanIdentifier(part)} match { + tableNameParts.getChildren.asScala.map { + case Token(part, Nil) => cleanIdentifier(part) + } match { case Seq(tableOnly) => Seq(tableOnly) case Seq(databaseName, table) => Seq(databaseName, table) case other => sys.error("Hive only supports tables names like 'tableName' " + @@ -1139,7 +1156,8 @@ https://cwiki.apache.org/confluence/display/Hive/Enhanced+Aggregation%2C+Cube%2C val isPreserved = tableOrdinals.map(i => (i - 1 < 0) || joinArgs(i - 1).getText == "PRESERVE") val tables = tableOrdinals.map(i => nodeToRelation(joinArgs(i))) - val joinExpressions = tableOrdinals.map(i => joinArgs(i + 1).getChildren.map(nodeToExpr)) + val joinExpressions = + tableOrdinals.map(i => joinArgs(i + 1).getChildren.asScala.map(nodeToExpr)) val joinConditions = joinExpressions.sliding(2).map { case Seq(c1, c2) => @@ -1164,7 +1182,7 @@ https://cwiki.apache.org/confluence/display/Hive/Enhanced+Aggregation%2C+Cube%2C joinType = joinType.remove(joinType.length - 1)) } - val groups = (0 until joinExpressions.head.size).map(i => Coalesce(joinExpressions.map(_(i)))) + val groups = joinExpressions.head.indices.map(i => Coalesce(joinExpressions.map(_(i)))) // Unique join is not really the same as an outer join so we must group together results where // the joinExpressions are the same, taking the First of each value is only okay because the @@ -1229,7 +1247,7 @@ https://cwiki.apache.org/confluence/display/Hive/Enhanced+Aggregation%2C+Cube%2C val tableIdent = extractTableIdent(tableNameParts) - val partitionKeys = partitionClause.map(_.getChildren.map { + val partitionKeys = partitionClause.map(_.getChildren.asScala.map { // Parse partitions. We also make keys case insensitive. case Token("TOK_PARTVAL", Token(key, Nil) :: Token(value, Nil) :: Nil) => cleanIdentifier(key.toLowerCase) -> Some(PlanUtils.stripQuotes(value)) @@ -1249,7 +1267,7 @@ https://cwiki.apache.org/confluence/display/Hive/Enhanced+Aggregation%2C+Cube%2C val tableIdent = extractTableIdent(tableNameParts) - val partitionKeys = partitionClause.map(_.getChildren.map { + val partitionKeys = partitionClause.map(_.getChildren.asScala.map { // Parse partitions. We also make keys case insensitive. 
case Token("TOK_PARTVAL", Token(key, Nil) :: Token(value, Nil) :: Nil) => cleanIdentifier(key.toLowerCase) -> Some(PlanUtils.stripQuotes(value)) @@ -1590,18 +1608,18 @@ https://cwiki.apache.org/confluence/display/Hive/Enhanced+Aggregation%2C+Cube%2C val (partitionByClause :: orderByClause :: sortByClause :: clusterByClause :: Nil) = getClauses( Seq("TOK_DISTRIBUTEBY", "TOK_ORDERBY", "TOK_SORTBY", "TOK_CLUSTERBY"), - partitionAndOrdering.getChildren.toSeq.asInstanceOf[Seq[ASTNode]]) + partitionAndOrdering.getChildren.asScala.asInstanceOf[Seq[ASTNode]]) (partitionByClause, orderByClause.orElse(sortByClause), clusterByClause) match { case (Some(partitionByExpr), Some(orderByExpr), None) => - (partitionByExpr.getChildren.map(nodeToExpr), - orderByExpr.getChildren.map(nodeToSortOrder)) + (partitionByExpr.getChildren.asScala.map(nodeToExpr), + orderByExpr.getChildren.asScala.map(nodeToSortOrder)) case (Some(partitionByExpr), None, None) => - (partitionByExpr.getChildren.map(nodeToExpr), Nil) + (partitionByExpr.getChildren.asScala.map(nodeToExpr), Nil) case (None, Some(orderByExpr), None) => - (Nil, orderByExpr.getChildren.map(nodeToSortOrder)) + (Nil, orderByExpr.getChildren.asScala.map(nodeToSortOrder)) case (None, None, Some(clusterByExpr)) => - val expressions = clusterByExpr.getChildren.map(nodeToExpr) + val expressions = clusterByExpr.getChildren.asScala.map(nodeToExpr) (expressions, expressions.map(SortOrder(_, Ascending))) case _ => throw new NotImplementedError( @@ -1639,7 +1657,7 @@ https://cwiki.apache.org/confluence/display/Hive/Enhanced+Aggregation%2C+Cube%2C } rowFrame.orElse(rangeFrame).map { frame => - frame.getChildren.toList match { + frame.getChildren.asScala.toList match { case precedingNode :: followingNode :: Nil => SpecifiedWindowFrame( frameType, @@ -1701,7 +1719,7 @@ https://cwiki.apache.org/confluence/display/Hive/Enhanced+Aggregation%2C+Cube%2C case other => sys.error(s"Non ASTNode encountered: $other") } - Option(node.getChildren).map(_.toList).getOrElse(Nil).foreach(dumpTree(_, builder, indent + 1)) + Option(node.getChildren).map(_.asScala).getOrElse(Nil).foreach(dumpTree(_, builder, indent + 1)) builder } } diff --git a/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveShim.scala b/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveShim.scala index 267074f3ad102..004805f3aed0b 100644 --- a/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveShim.scala +++ b/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveShim.scala @@ -22,8 +22,7 @@ import java.rmi.server.UID import org.apache.avro.Schema -/* Implicit conversions */ -import scala.collection.JavaConversions._ +import scala.collection.JavaConverters._ import scala.language.implicitConversions import scala.reflect.ClassTag @@ -73,7 +72,7 @@ private[hive] object HiveShim { */ def appendReadColumns(conf: Configuration, ids: Seq[Integer], names: Seq[String]) { if (ids != null && ids.nonEmpty) { - ColumnProjectionUtils.appendReadColumns(conf, ids) + ColumnProjectionUtils.appendReadColumns(conf, ids.asJava) } if (names != null && names.nonEmpty) { appendReadColumnNames(conf, names) diff --git a/sql/hive/src/main/scala/org/apache/spark/sql/hive/client/ClientWrapper.scala b/sql/hive/src/main/scala/org/apache/spark/sql/hive/client/ClientWrapper.scala index f49c97de8ff4e..4d1e3ed9198e6 100644 --- a/sql/hive/src/main/scala/org/apache/spark/sql/hive/client/ClientWrapper.scala +++ b/sql/hive/src/main/scala/org/apache/spark/sql/hive/client/ClientWrapper.scala @@ -21,7 +21,7 @@ import java.io.{File, PrintStream} import 
java.util.{Map => JMap} import javax.annotation.concurrent.GuardedBy -import scala.collection.JavaConversions._ +import scala.collection.JavaConverters._ import scala.language.reflectiveCalls import org.apache.hadoop.fs.Path @@ -305,10 +305,11 @@ private[hive] class ClientWrapper( HiveTable( name = h.getTableName, specifiedDatabase = Option(h.getDbName), - schema = h.getCols.map(f => HiveColumn(f.getName, f.getType, f.getComment)), - partitionColumns = h.getPartCols.map(f => HiveColumn(f.getName, f.getType, f.getComment)), - properties = h.getParameters.toMap, - serdeProperties = h.getTTable.getSd.getSerdeInfo.getParameters.toMap, + schema = h.getCols.asScala.map(f => HiveColumn(f.getName, f.getType, f.getComment)), + partitionColumns = h.getPartCols.asScala.map(f => + HiveColumn(f.getName, f.getType, f.getComment)), + properties = h.getParameters.asScala.toMap, + serdeProperties = h.getTTable.getSd.getSerdeInfo.getParameters.asScala.toMap, tableType = h.getTableType match { case HTableType.MANAGED_TABLE => ManagedTable case HTableType.EXTERNAL_TABLE => ExternalTable @@ -334,9 +335,9 @@ private[hive] class ClientWrapper( private def toQlTable(table: HiveTable): metadata.Table = { val qlTable = new metadata.Table(table.database, table.name) - qlTable.setFields(table.schema.map(c => new FieldSchema(c.name, c.hiveType, c.comment))) + qlTable.setFields(table.schema.map(c => new FieldSchema(c.name, c.hiveType, c.comment)).asJava) qlTable.setPartCols( - table.partitionColumns.map(c => new FieldSchema(c.name, c.hiveType, c.comment))) + table.partitionColumns.map(c => new FieldSchema(c.name, c.hiveType, c.comment)).asJava) table.properties.foreach { case (k, v) => qlTable.setProperty(k, v) } table.serdeProperties.foreach { case (k, v) => qlTable.setSerdeParam(k, v) } @@ -366,13 +367,13 @@ private[hive] class ClientWrapper( private def toHivePartition(partition: metadata.Partition): HivePartition = { val apiPartition = partition.getTPartition HivePartition( - values = Option(apiPartition.getValues).map(_.toSeq).getOrElse(Seq.empty), + values = Option(apiPartition.getValues).map(_.asScala).getOrElse(Seq.empty), storage = HiveStorageDescriptor( location = apiPartition.getSd.getLocation, inputFormat = apiPartition.getSd.getInputFormat, outputFormat = apiPartition.getSd.getOutputFormat, serde = apiPartition.getSd.getSerdeInfo.getSerializationLib, - serdeProperties = apiPartition.getSd.getSerdeInfo.getParameters.toMap)) + serdeProperties = apiPartition.getSd.getSerdeInfo.getParameters.asScala.toMap)) } override def getPartitionOption( @@ -397,7 +398,7 @@ private[hive] class ClientWrapper( } override def listTables(dbName: String): Seq[String] = withHiveState { - client.getAllTables(dbName) + client.getAllTables(dbName).asScala } /** @@ -514,17 +515,17 @@ private[hive] class ClientWrapper( } def reset(): Unit = withHiveState { - client.getAllTables("default").foreach { t => + client.getAllTables("default").asScala.foreach { t => logDebug(s"Deleting table $t") val table = client.getTable("default", t) - client.getIndexes("default", t, 255).foreach { index => + client.getIndexes("default", t, 255).asScala.foreach { index => shim.dropIndex(client, "default", t, index.getIndexName) } if (!table.isIndexTable) { client.dropTable("default", t) } } - client.getAllDatabases.filterNot(_ == "default").foreach { db => + client.getAllDatabases.asScala.filterNot(_ == "default").foreach { db => logDebug(s"Dropping Database: $db") client.dropDatabase(db, true, false, true) } diff --git 
a/sql/hive/src/main/scala/org/apache/spark/sql/hive/client/HiveShim.scala b/sql/hive/src/main/scala/org/apache/spark/sql/hive/client/HiveShim.scala index 8fc8935b1dc3c..48bbb21e6c1de 100644 --- a/sql/hive/src/main/scala/org/apache/spark/sql/hive/client/HiveShim.scala +++ b/sql/hive/src/main/scala/org/apache/spark/sql/hive/client/HiveShim.scala @@ -23,7 +23,7 @@ import java.net.URI import java.util.{ArrayList => JArrayList, List => JList, Map => JMap, Set => JSet} import java.util.concurrent.TimeUnit -import scala.collection.JavaConversions._ +import scala.collection.JavaConverters._ import org.apache.hadoop.fs.{FileSystem, Path} import org.apache.hadoop.hive.conf.HiveConf @@ -201,7 +201,7 @@ private[client] class Shim_v0_12 extends Shim with Logging { setDataLocationMethod.invoke(table, new URI(loc)) override def getAllPartitions(hive: Hive, table: Table): Seq[Partition] = - getAllPartitionsMethod.invoke(hive, table).asInstanceOf[JSet[Partition]].toSeq + getAllPartitionsMethod.invoke(hive, table).asInstanceOf[JSet[Partition]].asScala.toSeq override def getPartitionsByFilter( hive: Hive, @@ -220,7 +220,7 @@ private[client] class Shim_v0_12 extends Shim with Logging { override def getDriverResults(driver: Driver): Seq[String] = { val res = new JArrayList[String]() getDriverResultsMethod.invoke(driver, res) - res.toSeq + res.asScala } override def getMetastoreClientConnectRetryDelayMillis(conf: HiveConf): Long = { @@ -310,7 +310,7 @@ private[client] class Shim_v0_13 extends Shim_v0_12 { setDataLocationMethod.invoke(table, new Path(loc)) override def getAllPartitions(hive: Hive, table: Table): Seq[Partition] = - getAllPartitionsMethod.invoke(hive, table).asInstanceOf[JSet[Partition]].toSeq + getAllPartitionsMethod.invoke(hive, table).asInstanceOf[JSet[Partition]].asScala.toSeq /** * Converts catalyst expression to the format that Hive's getPartitionsByFilter() expects, i.e. @@ -320,7 +320,7 @@ private[client] class Shim_v0_13 extends Shim_v0_12 { */ def convertFilters(table: Table, filters: Seq[Expression]): String = { // hive varchar is treated as catalyst string, but hive varchar can't be pushed down. 
- val varcharKeys = table.getPartitionKeys + val varcharKeys = table.getPartitionKeys.asScala .filter(col => col.getType.startsWith(serdeConstants.VARCHAR_TYPE_NAME)) .map(col => col.getName).toSet @@ -354,7 +354,7 @@ private[client] class Shim_v0_13 extends Shim_v0_12 { getPartitionsByFilterMethod.invoke(hive, table, filter).asInstanceOf[JArrayList[Partition]] } - partitions.toSeq + partitions.asScala.toSeq } override def getCommandProcessor(token: String, conf: HiveConf): CommandProcessor = @@ -363,7 +363,7 @@ private[client] class Shim_v0_13 extends Shim_v0_12 { override def getDriverResults(driver: Driver): Seq[String] = { val res = new JArrayList[Object]() getDriverResultsMethod.invoke(driver, res) - res.map { r => + res.asScala.map { r => r match { case s: String => s case a: Array[Object] => a(0).asInstanceOf[String] diff --git a/sql/hive/src/main/scala/org/apache/spark/sql/hive/execution/DescribeHiveTableCommand.scala b/sql/hive/src/main/scala/org/apache/spark/sql/hive/execution/DescribeHiveTableCommand.scala index 5f0ed5393d191..441b6b6033e1f 100644 --- a/sql/hive/src/main/scala/org/apache/spark/sql/hive/execution/DescribeHiveTableCommand.scala +++ b/sql/hive/src/main/scala/org/apache/spark/sql/hive/execution/DescribeHiveTableCommand.scala @@ -17,7 +17,7 @@ package org.apache.spark.sql.hive.execution -import scala.collection.JavaConversions._ +import scala.collection.JavaConverters._ import org.apache.hadoop.hive.metastore.api.FieldSchema @@ -39,8 +39,8 @@ case class DescribeHiveTableCommand( // Trying to mimic the format of Hive's output. But not exactly the same. var results: Seq[(String, String, String)] = Nil - val columns: Seq[FieldSchema] = table.hiveQlTable.getCols - val partitionColumns: Seq[FieldSchema] = table.hiveQlTable.getPartCols + val columns: Seq[FieldSchema] = table.hiveQlTable.getCols.asScala + val partitionColumns: Seq[FieldSchema] = table.hiveQlTable.getPartCols.asScala results ++= columns.map(field => (field.getName, field.getType, field.getComment)) if (partitionColumns.nonEmpty) { val partColumnInfo = @@ -48,7 +48,7 @@ case class DescribeHiveTableCommand( results ++= partColumnInfo ++ Seq(("# Partition Information", "", "")) ++ - Seq((s"# ${output.get(0).name}", output.get(1).name, output.get(2).name)) ++ + Seq((s"# ${output(0).name}", output(1).name, output(2).name)) ++ partColumnInfo } diff --git a/sql/hive/src/main/scala/org/apache/spark/sql/hive/execution/HiveTableScan.scala b/sql/hive/src/main/scala/org/apache/spark/sql/hive/execution/HiveTableScan.scala index ba7eb15a1c0c6..806d2b9b0b7d4 100644 --- a/sql/hive/src/main/scala/org/apache/spark/sql/hive/execution/HiveTableScan.scala +++ b/sql/hive/src/main/scala/org/apache/spark/sql/hive/execution/HiveTableScan.scala @@ -17,7 +17,7 @@ package org.apache.spark.sql.hive.execution -import scala.collection.JavaConversions._ +import scala.collection.JavaConverters._ import org.apache.hadoop.hive.conf.HiveConf import org.apache.hadoop.hive.ql.metadata.{Partition => HivePartition} @@ -98,7 +98,7 @@ case class HiveTableScan( .asInstanceOf[StructObjectInspector] val columnTypeNames = structOI - .getAllStructFieldRefs + .getAllStructFieldRefs.asScala .map(_.getFieldObjectInspector) .map(TypeInfoUtils.getTypeInfoFromObjectInspector(_).getTypeName) .mkString(",") @@ -118,9 +118,8 @@ case class HiveTableScan( case None => partitions case Some(shouldKeep) => partitions.filter { part => val dataTypes = relation.partitionKeys.map(_.dataType) - val castedValues = for ((value, dataType) <- part.getValues.zip(dataTypes)) 
yield { - castFromString(value, dataType) - } + val castedValues = part.getValues.asScala.zip(dataTypes) + .map { case (value, dataType) => castFromString(value, dataType) } // Only partitioned values are needed here, since the predicate has already been bound to // partition key attribute references. diff --git a/sql/hive/src/main/scala/org/apache/spark/sql/hive/execution/InsertIntoHiveTable.scala b/sql/hive/src/main/scala/org/apache/spark/sql/hive/execution/InsertIntoHiveTable.scala index 62efda613a176..58f7fa640e8a9 100644 --- a/sql/hive/src/main/scala/org/apache/spark/sql/hive/execution/InsertIntoHiveTable.scala +++ b/sql/hive/src/main/scala/org/apache/spark/sql/hive/execution/InsertIntoHiveTable.scala @@ -19,9 +19,10 @@ package org.apache.spark.sql.hive.execution import java.util +import scala.collection.JavaConverters._ + import org.apache.hadoop.hive.conf.HiveConf import org.apache.hadoop.hive.conf.HiveConf.ConfVars -import org.apache.hadoop.hive.metastore.MetaStoreUtils import org.apache.hadoop.hive.ql.plan.TableDesc import org.apache.hadoop.hive.ql.{Context, ErrorMsg} import org.apache.hadoop.hive.serde2.Serializer @@ -38,8 +39,6 @@ import org.apache.spark.sql.hive.HiveShim.{ShimFileSinkDesc => FileSinkDesc} import org.apache.spark.sql.hive._ import org.apache.spark.sql.types.DataType import org.apache.spark.{SparkException, TaskContext} - -import scala.collection.JavaConversions._ import org.apache.spark.util.SerializableJobConf private[hive] @@ -94,7 +93,8 @@ case class InsertIntoHiveTable( ObjectInspectorCopyOption.JAVA) .asInstanceOf[StructObjectInspector] - val fieldOIs = standardOI.getAllStructFieldRefs.map(_.getFieldObjectInspector).toArray + val fieldOIs = standardOI.getAllStructFieldRefs.asScala + .map(_.getFieldObjectInspector).toArray val dataTypes: Array[DataType] = child.output.map(_.dataType).toArray val wrappers = fieldOIs.zip(dataTypes).map { case (f, dt) => wrapperFor(f, dt)} val outputData = new Array[Any](fieldOIs.length) @@ -198,7 +198,7 @@ case class InsertIntoHiveTable( // loadPartition call orders directories created on the iteration order of the this map val orderedPartitionSpec = new util.LinkedHashMap[String, String]() - table.hiveQlTable.getPartCols().foreach { entry => + table.hiveQlTable.getPartCols.asScala.foreach { entry => orderedPartitionSpec.put(entry.getName, partitionSpec.get(entry.getName).getOrElse("")) } @@ -226,7 +226,7 @@ case class InsertIntoHiveTable( val oldPart = catalog.client.getPartitionOption( catalog.client.getTable(table.databaseName, table.tableName), - partitionSpec) + partitionSpec.asJava) if (oldPart.isEmpty || !ifNotExists) { catalog.client.loadPartition( diff --git a/sql/hive/src/main/scala/org/apache/spark/sql/hive/execution/ScriptTransformation.scala b/sql/hive/src/main/scala/org/apache/spark/sql/hive/execution/ScriptTransformation.scala index ade27454b9d29..c7651daffe36e 100644 --- a/sql/hive/src/main/scala/org/apache/spark/sql/hive/execution/ScriptTransformation.scala +++ b/sql/hive/src/main/scala/org/apache/spark/sql/hive/execution/ScriptTransformation.scala @@ -21,7 +21,7 @@ import java.io._ import java.util.Properties import javax.annotation.Nullable -import scala.collection.JavaConversions._ +import scala.collection.JavaConverters._ import scala.util.control.NonFatal import org.apache.hadoop.hive.serde.serdeConstants @@ -61,7 +61,7 @@ case class ScriptTransformation( protected override def doExecute(): RDD[InternalRow] = { def processIterator(inputIterator: Iterator[InternalRow]): Iterator[InternalRow] = { val cmd = 
List("/bin/bash", "-c", script) - val builder = new ProcessBuilder(cmd) + val builder = new ProcessBuilder(cmd.asJava) val proc = builder.start() val inputStream = proc.getInputStream @@ -172,10 +172,10 @@ case class ScriptTransformation( val fieldList = outputSoi.getAllStructFieldRefs() var i = 0 while (i < dataList.size()) { - if (dataList(i) == null) { + if (dataList.get(i) == null) { mutableRow.setNullAt(i) } else { - mutableRow(i) = unwrap(dataList(i), fieldList(i).getFieldObjectInspector) + mutableRow(i) = unwrap(dataList.get(i), fieldList.get(i).getFieldObjectInspector) } i += 1 } @@ -307,7 +307,7 @@ case class HiveScriptIOSchema ( val serde = initSerDe(serdeClass, columns, columnTypes, inputSerdeProps) val fieldObjectInspectors = columnTypes.map(toInspector) val objectInspector = ObjectInspectorFactory - .getStandardStructObjectInspector(columns, fieldObjectInspectors) + .getStandardStructObjectInspector(columns.asJava, fieldObjectInspectors.asJava) .asInstanceOf[ObjectInspector] (serde, objectInspector) } @@ -342,7 +342,7 @@ case class HiveScriptIOSchema ( propsMap = propsMap + (serdeConstants.LIST_COLUMN_TYPES -> columnTypesNames) val properties = new Properties() - properties.putAll(propsMap) + properties.putAll(propsMap.asJava) serde.initialize(null, properties) serde diff --git a/sql/hive/src/main/scala/org/apache/spark/sql/hive/hiveUDFs.scala b/sql/hive/src/main/scala/org/apache/spark/sql/hive/hiveUDFs.scala index 7182246e466a4..cad02373e5ba1 100644 --- a/sql/hive/src/main/scala/org/apache/spark/sql/hive/hiveUDFs.scala +++ b/sql/hive/src/main/scala/org/apache/spark/sql/hive/hiveUDFs.scala @@ -18,7 +18,7 @@ package org.apache.spark.sql.hive import scala.collection.mutable.ArrayBuffer -import scala.collection.JavaConversions._ +import scala.collection.JavaConverters._ import scala.util.Try import org.apache.hadoop.hive.serde2.objectinspector.{ObjectInspector, ConstantObjectInspector} @@ -81,8 +81,7 @@ private[hive] class HiveFunctionRegistry(underlying: analysis.FunctionRegistry) /* List all of the registered function names. */ override def listFunction(): Seq[String] = { - val a = FunctionRegistry.getFunctionNames ++ underlying.listFunction() - a.toList.sorted + (FunctionRegistry.getFunctionNames.asScala ++ underlying.listFunction()).toList.sorted } /* Get the class of the registered function by specified name. 
*/ @@ -116,7 +115,7 @@ private[hive] case class HiveSimpleUDF(funcWrapper: HiveFunctionWrapper, childre @transient private lazy val method = - function.getResolver.getEvalMethod(children.map(_.dataType.toTypeInfo)) + function.getResolver.getEvalMethod(children.map(_.dataType.toTypeInfo).asJava) @transient private lazy val arguments = children.map(toInspector).toArray @@ -541,7 +540,7 @@ private[hive] case class HiveGenericUDTF( @transient protected lazy val collector = new UDTFCollector - lazy val elementTypes = outputInspector.getAllStructFieldRefs.map { + lazy val elementTypes = outputInspector.getAllStructFieldRefs.asScala.map { field => (inspectorToDataType(field.getFieldObjectInspector), true) } diff --git a/sql/hive/src/main/scala/org/apache/spark/sql/hive/orc/OrcRelation.scala b/sql/hive/src/main/scala/org/apache/spark/sql/hive/orc/OrcRelation.scala index 9f4f8b5789afe..1cff5cf9c3543 100644 --- a/sql/hive/src/main/scala/org/apache/spark/sql/hive/orc/OrcRelation.scala +++ b/sql/hive/src/main/scala/org/apache/spark/sql/hive/orc/OrcRelation.scala @@ -19,6 +19,8 @@ package org.apache.spark.sql.hive.orc import java.util.Properties +import scala.collection.JavaConverters._ + import com.google.common.base.Objects import org.apache.hadoop.conf.Configuration import org.apache.hadoop.fs.{FileStatus, Path} @@ -43,9 +45,6 @@ import org.apache.spark.sql.types.StructType import org.apache.spark.sql.{Row, SQLContext} import org.apache.spark.util.SerializableConfiguration -/* Implicit conversions */ -import scala.collection.JavaConversions._ - private[sql] class DefaultSource extends HadoopFsRelationProvider with DataSourceRegister { override def shortName(): String = "orc" @@ -97,7 +96,8 @@ private[orc] class OrcOutputWriter( private val reusableOutputBuffer = new Array[Any](dataSchema.length) // Used to convert Catalyst values into Hadoop `Writable`s. - private val wrappers = structOI.getAllStructFieldRefs.zip(dataSchema.fields.map(_.dataType)) + private val wrappers = structOI.getAllStructFieldRefs.asScala + .zip(dataSchema.fields.map(_.dataType)) .map { case (ref, dt) => wrapperFor(ref.getFieldObjectInspector, dt) }.toArray diff --git a/sql/hive/src/main/scala/org/apache/spark/sql/hive/test/TestHive.scala b/sql/hive/src/main/scala/org/apache/spark/sql/hive/test/TestHive.scala index 4da86636ac100..572eaebe81ac2 100644 --- a/sql/hive/src/main/scala/org/apache/spark/sql/hive/test/TestHive.scala +++ b/sql/hive/src/main/scala/org/apache/spark/sql/hive/test/TestHive.scala @@ -20,6 +20,7 @@ package org.apache.spark.sql.hive.test import java.io.File import java.util.{Set => JavaSet} +import scala.collection.JavaConverters._ import scala.collection.mutable import scala.language.implicitConversions @@ -37,9 +38,6 @@ import org.apache.spark.sql.hive.execution.HiveNativeCommand import org.apache.spark.util.{ShutdownHookManager, Utils} import org.apache.spark.{SparkConf, SparkContext} -/* Implicit conversions */ -import scala.collection.JavaConversions._ - // SPARK-3729: Test key required to check for initialization errors with config. object TestHive extends TestHiveContext( @@ -405,7 +403,7 @@ class TestHiveContext(sc: SparkContext) extends HiveContext(sc) { def reset() { try { // HACK: Hive is too noisy by default. 
- org.apache.log4j.LogManager.getCurrentLoggers.foreach { log => + org.apache.log4j.LogManager.getCurrentLoggers.asScala.foreach { log => log.asInstanceOf[org.apache.log4j.Logger].setLevel(org.apache.log4j.Level.WARN) } @@ -415,9 +413,8 @@ class TestHiveContext(sc: SparkContext) extends HiveContext(sc) { catalog.client.reset() catalog.unregisterAllTables() - FunctionRegistry.getFunctionNames.filterNot(originalUDFs.contains(_)).foreach { udfName => - FunctionRegistry.unregisterTemporaryUDF(udfName) - } + FunctionRegistry.getFunctionNames.asScala.filterNot(originalUDFs.contains(_)). + foreach { udfName => FunctionRegistry.unregisterTemporaryUDF(udfName) } // Some tests corrupt this value on purpose, which breaks the RESET call below. hiveconf.set("fs.default.name", new File(".").toURI.toString) diff --git a/sql/hive/src/test/scala/org/apache/spark/sql/hive/client/FiltersSuite.scala b/sql/hive/src/test/scala/org/apache/spark/sql/hive/client/FiltersSuite.scala index 0efcf80bd4ea7..5e7b93d457106 100644 --- a/sql/hive/src/test/scala/org/apache/spark/sql/hive/client/FiltersSuite.scala +++ b/sql/hive/src/test/scala/org/apache/spark/sql/hive/client/FiltersSuite.scala @@ -17,7 +17,7 @@ package org.apache.spark.sql.hive.client -import scala.collection.JavaConversions._ +import java.util.Collections import org.apache.hadoop.hive.metastore.api.FieldSchema import org.apache.hadoop.hive.serde.serdeConstants @@ -38,7 +38,7 @@ class FiltersSuite extends SparkFunSuite with Logging { private val varCharCol = new FieldSchema() varCharCol.setName("varchar") varCharCol.setType(serdeConstants.VARCHAR_TYPE_NAME) - testTable.setPartCols(varCharCol :: Nil) + testTable.setPartCols(Collections.singletonList(varCharCol)) filterTest("string filter", (a("stringcol", StringType) > Literal("test")) :: Nil, diff --git a/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/HiveUDFSuite.scala b/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/HiveUDFSuite.scala index b03a35132325d..9c10ffe1113dc 100644 --- a/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/HiveUDFSuite.scala +++ b/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/HiveUDFSuite.scala @@ -18,8 +18,7 @@ package org.apache.spark.sql.hive.execution import java.io.{DataInput, DataOutput} -import java.util -import java.util.Properties +import java.util.{ArrayList, Arrays, Properties} import org.apache.hadoop.conf.Configuration import org.apache.hadoop.hive.ql.udf.generic.{GenericUDAFAverage, GenericUDF} @@ -33,8 +32,6 @@ import org.apache.spark.sql.hive.test.TestHive import org.apache.spark.util.Utils -import scala.collection.JavaConversions._ - case class Fields(f1: Int, f2: Int, f3: Int, f4: Int, f5: Int) // Case classes for the custom UDF's. 
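The HiveUDFSuite hunks that follow build the test SerDe's object inspectors from explicit java.util.Arrays.asList(...) calls instead of implicitly converted Seqs, and indexed access moves from Scala's row(0) to Java's row.get(0). A small stand-alone sketch of the same idiom, with illustrative names and values:

import java.util.{ArrayList, Arrays, List => JList}

object JavaListSketch {
  def main(args: Array[String]): Unit = {
    // A fixed-size Java list built explicitly rather than via an implicit Seq conversion.
    val fieldNames: JList[String] = Arrays.asList("id", "value")

    // Nested mutable Java lists, read with get(i) instead of Scala's apply(i).
    val row = new ArrayList[ArrayList[AnyRef]]()
    row.add(new ArrayList[AnyRef](2))
    row.get(0).add(Integer.valueOf(1))
    row.get(0).add(Integer.valueOf(2))

    println(fieldNames) // [id, value]
    println(row.get(0)) // [1, 2]
  }
}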
@@ -326,11 +323,11 @@ class PairSerDe extends AbstractSerDe { override def getObjectInspector: ObjectInspector = { ObjectInspectorFactory .getStandardStructObjectInspector( - Seq("pair"), - Seq(ObjectInspectorFactory.getStandardStructObjectInspector( - Seq("id", "value"), - Seq(PrimitiveObjectInspectorFactory.javaIntObjectInspector, - PrimitiveObjectInspectorFactory.javaIntObjectInspector)) + Arrays.asList("pair"), + Arrays.asList(ObjectInspectorFactory.getStandardStructObjectInspector( + Arrays.asList("id", "value"), + Arrays.asList(PrimitiveObjectInspectorFactory.javaIntObjectInspector, + PrimitiveObjectInspectorFactory.javaIntObjectInspector)) )) } @@ -343,10 +340,10 @@ class PairSerDe extends AbstractSerDe { override def deserialize(value: Writable): AnyRef = { val pair = value.asInstanceOf[TestPair] - val row = new util.ArrayList[util.ArrayList[AnyRef]] - row.add(new util.ArrayList[AnyRef](2)) - row(0).add(Integer.valueOf(pair.entry._1)) - row(0).add(Integer.valueOf(pair.entry._2)) + val row = new ArrayList[ArrayList[AnyRef]] + row.add(new ArrayList[AnyRef](2)) + row.get(0).add(Integer.valueOf(pair.entry._1)) + row.get(0).add(Integer.valueOf(pair.entry._2)) row } @@ -355,9 +352,9 @@ class PairSerDe extends AbstractSerDe { class PairUDF extends GenericUDF { override def initialize(p1: Array[ObjectInspector]): ObjectInspector = ObjectInspectorFactory.getStandardStructObjectInspector( - Seq("id", "value"), - Seq(PrimitiveObjectInspectorFactory.javaIntObjectInspector, - PrimitiveObjectInspectorFactory.javaIntObjectInspector) + Arrays.asList("id", "value"), + Arrays.asList(PrimitiveObjectInspectorFactory.javaIntObjectInspector, + PrimitiveObjectInspectorFactory.javaIntObjectInspector) ) override def evaluate(args: Array[DeferredObject]): AnyRef = { diff --git a/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/PruningSuite.scala b/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/PruningSuite.scala index 3bf8f3ac20480..210d566745415 100644 --- a/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/PruningSuite.scala +++ b/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/PruningSuite.scala @@ -17,13 +17,12 @@ package org.apache.spark.sql.hive.execution +import scala.collection.JavaConverters._ + import org.scalatest.BeforeAndAfter import org.apache.spark.sql.hive.test.TestHive -/* Implicit conversions */ -import scala.collection.JavaConversions._ - /** * A set of test cases that validate partition and column pruning. 
*/ @@ -161,7 +160,7 @@ class PruningSuite extends HiveComparisonTest with BeforeAndAfter { assert(actualOutputColumns === expectedOutputColumns, "Output columns mismatch") assert(actualScannedColumns === expectedScannedColumns, "Scanned columns mismatch") - val actualPartitions = actualPartValues.map(_.toSeq.mkString(",")).sorted + val actualPartitions = actualPartValues.map(_.asScala.mkString(",")).sorted val expectedPartitions = expectedPartValues.map(_.mkString(",")).sorted assert(actualPartitions === expectedPartitions, "Partitions selected do not match") diff --git a/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/SQLQuerySuite.scala b/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/SQLQuerySuite.scala index 55ecbd5b5f21d..1ff1d9a2934cc 100644 --- a/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/SQLQuerySuite.scala +++ b/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/SQLQuerySuite.scala @@ -19,7 +19,7 @@ package org.apache.spark.sql.hive.execution import java.sql.{Date, Timestamp} -import scala.collection.JavaConversions._ +import scala.collection.JavaConverters._ import org.apache.spark.sql._ import org.apache.spark.sql.catalyst.DefaultParserDialect @@ -164,7 +164,7 @@ class SQLQuerySuite extends QueryTest with SQLTestUtils { test("show functions") { val allFunctions = (FunctionRegistry.builtin.listFunction().toSet[String] ++ - org.apache.hadoop.hive.ql.exec.FunctionRegistry.getFunctionNames).toList.sorted + org.apache.hadoop.hive.ql.exec.FunctionRegistry.getFunctionNames.asScala).toList.sorted checkAnswer(sql("SHOW functions"), allFunctions.map(Row(_))) checkAnswer(sql("SHOW functions abs"), Row("abs")) checkAnswer(sql("SHOW functions 'abs'"), Row("abs")) diff --git a/sql/hive/src/test/scala/org/apache/spark/sql/sources/hadoopFsRelationSuites.scala b/sql/hive/src/test/scala/org/apache/spark/sql/sources/hadoopFsRelationSuites.scala index 5bbca14bad320..7966b43596e75 100644 --- a/sql/hive/src/test/scala/org/apache/spark/sql/sources/hadoopFsRelationSuites.scala +++ b/sql/hive/src/test/scala/org/apache/spark/sql/sources/hadoopFsRelationSuites.scala @@ -17,9 +17,7 @@ package org.apache.spark.sql.sources -import java.sql.Date - -import scala.collection.JavaConversions._ +import scala.collection.JavaConverters._ import org.apache.hadoop.conf.Configuration import org.apache.hadoop.fs.Path @@ -552,7 +550,7 @@ abstract class HadoopFsRelationTest extends QueryTest with SQLTestUtils { } finally { // Hadoop 1 doesn't have `Configuration.unset` configuration.clear() - clonedConf.foreach(entry => configuration.set(entry.getKey, entry.getValue)) + clonedConf.asScala.foreach(entry => configuration.set(entry.getKey, entry.getValue)) } } @@ -600,7 +598,7 @@ abstract class HadoopFsRelationTest extends QueryTest with SQLTestUtils { } finally { // Hadoop 1 doesn't have `Configuration.unset` configuration.clear() - clonedConf.foreach(entry => configuration.set(entry.getKey, entry.getValue)) + clonedConf.asScala.foreach(entry => configuration.set(entry.getKey, entry.getValue)) sqlContext.sparkContext.conf.set("spark.speculation", speculationEnabled.toString) } } diff --git a/streaming/src/main/scala/org/apache/spark/streaming/api/java/JavaDStreamLike.scala b/streaming/src/main/scala/org/apache/spark/streaming/api/java/JavaDStreamLike.scala index 214cd80108b9b..edfa474677f15 100644 --- a/streaming/src/main/scala/org/apache/spark/streaming/api/java/JavaDStreamLike.scala +++ b/streaming/src/main/scala/org/apache/spark/streaming/api/java/JavaDStreamLike.scala 
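The JavaDStreamLike hunks that follow bridge the Java-facing streaming API to the Scala DStream by decorating iterators explicitly, .asJava on the way into the user's Java function and .asScala on the way back, instead of relying on the old asJavaIterator and asScalaIterator implicits. A minimal sketch of that round-trip; the transform function here is a stand-in, not Spark's FlatMapFunction:

import java.util.{ArrayList => JArrayList, Iterator => JIterator}
import scala.collection.JavaConverters._

object IteratorBridgeSketch {
  // A Java-style callback standing in for the user-supplied function of the Java API.
  def javaSideTransform(it: JIterator[Int]): JIterator[Int] = {
    val doubled = new JArrayList[Int]()
    while (it.hasNext) doubled.add(it.next() * 2)
    doubled.iterator()
  }

  def main(args: Array[String]): Unit = {
    val scalaIterator: Iterator[Int] = Iterator(1, 2, 3)
    // Scala -> Java going in, Java -> Scala coming back; both conversions are explicit.
    val result: Iterator[Int] = javaSideTransform(scalaIterator.asJava).asScala
    println(result.toList) // List(2, 4, 6)
  }
}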
@@ -17,11 +17,10 @@ package org.apache.spark.streaming.api.java -import java.util import java.lang.{Long => JLong} import java.util.{List => JList} -import scala.collection.JavaConversions._ +import scala.collection.JavaConverters._ import scala.language.implicitConversions import scala.reflect.ClassTag @@ -145,8 +144,7 @@ trait JavaDStreamLike[T, This <: JavaDStreamLike[T, This, R], R <: JavaRDDLike[T * an array. */ def glom(): JavaDStream[JList[T]] = - new JavaDStream(dstream.glom().map(x => new java.util.ArrayList[T](x.toSeq))) - + new JavaDStream(dstream.glom().map(_.toSeq.asJava)) /** Return the [[org.apache.spark.streaming.StreamingContext]] associated with this DStream */ @@ -191,7 +189,7 @@ trait JavaDStreamLike[T, This <: JavaDStreamLike[T, This, R], R <: JavaRDDLike[T */ def mapPartitions[U](f: FlatMapFunction[java.util.Iterator[T], U]): JavaDStream[U] = { def fn: (Iterator[T]) => Iterator[U] = { - (x: Iterator[T]) => asScalaIterator(f.call(asJavaIterator(x)).iterator()) + (x: Iterator[T]) => f.call(x.asJava).iterator().asScala } new JavaDStream(dstream.mapPartitions(fn)(fakeClassTag[U]))(fakeClassTag[U]) } @@ -204,7 +202,7 @@ trait JavaDStreamLike[T, This <: JavaDStreamLike[T, This, R], R <: JavaRDDLike[T def mapPartitionsToPair[K2, V2](f: PairFlatMapFunction[java.util.Iterator[T], K2, V2]) : JavaPairDStream[K2, V2] = { def fn: (Iterator[T]) => Iterator[(K2, V2)] = { - (x: Iterator[T]) => asScalaIterator(f.call(asJavaIterator(x)).iterator()) + (x: Iterator[T]) => f.call(x.asJava).iterator().asScala } new JavaPairDStream(dstream.mapPartitions(fn))(fakeClassTag[K2], fakeClassTag[V2]) } @@ -282,7 +280,7 @@ trait JavaDStreamLike[T, This <: JavaDStreamLike[T, This, R], R <: JavaRDDLike[T * Return all the RDDs between 'fromDuration' to 'toDuration' (both included) */ def slice(fromTime: Time, toTime: Time): JList[R] = { - new util.ArrayList(dstream.slice(fromTime, toTime).map(wrapRDD(_)).toSeq) + dstream.slice(fromTime, toTime).map(wrapRDD).asJava } /** diff --git a/streaming/src/main/scala/org/apache/spark/streaming/api/java/JavaPairDStream.scala b/streaming/src/main/scala/org/apache/spark/streaming/api/java/JavaPairDStream.scala index 26383e420101e..e2aec6c2f63e7 100644 --- a/streaming/src/main/scala/org/apache/spark/streaming/api/java/JavaPairDStream.scala +++ b/streaming/src/main/scala/org/apache/spark/streaming/api/java/JavaPairDStream.scala @@ -20,7 +20,7 @@ package org.apache.spark.streaming.api.java import java.lang.{Long => JLong, Iterable => JIterable} import java.util.{List => JList} -import scala.collection.JavaConversions._ +import scala.collection.JavaConverters._ import scala.language.implicitConversions import scala.reflect.ClassTag @@ -116,14 +116,14 @@ class JavaPairDStream[K, V](val dstream: DStream[(K, V)])( * generate the RDDs with Spark's default number of partitions. */ def groupByKey(): JavaPairDStream[K, JIterable[V]] = - dstream.groupByKey().mapValues(asJavaIterable _) + dstream.groupByKey().mapValues(_.asJava) /** * Return a new DStream by applying `groupByKey` to each RDD. Hash partitioning is used to * generate the RDDs with `numPartitions` partitions. */ def groupByKey(numPartitions: Int): JavaPairDStream[K, JIterable[V]] = - dstream.groupByKey(numPartitions).mapValues(asJavaIterable _) + dstream.groupByKey(numPartitions).mapValues(_.asJava) /** * Return a new DStream by applying `groupByKey` on each RDD of `this` DStream. @@ -132,7 +132,7 @@ class JavaPairDStream[K, V](val dstream: DStream[(K, V)])( * is used to control the partitioning of each RDD. 
*/ def groupByKey(partitioner: Partitioner): JavaPairDStream[K, JIterable[V]] = - dstream.groupByKey(partitioner).mapValues(asJavaIterable _) + dstream.groupByKey(partitioner).mapValues(_.asJava) /** * Return a new DStream by applying `reduceByKey` to each RDD. The values for each key are @@ -197,7 +197,7 @@ class JavaPairDStream[K, V](val dstream: DStream[(K, V)])( * batching interval */ def groupByKeyAndWindow(windowDuration: Duration): JavaPairDStream[K, JIterable[V]] = { - dstream.groupByKeyAndWindow(windowDuration).mapValues(asJavaIterable _) + dstream.groupByKeyAndWindow(windowDuration).mapValues(_.asJava) } /** @@ -212,7 +212,7 @@ class JavaPairDStream[K, V](val dstream: DStream[(K, V)])( */ def groupByKeyAndWindow(windowDuration: Duration, slideDuration: Duration) : JavaPairDStream[K, JIterable[V]] = { - dstream.groupByKeyAndWindow(windowDuration, slideDuration).mapValues(asJavaIterable _) + dstream.groupByKeyAndWindow(windowDuration, slideDuration).mapValues(_.asJava) } /** @@ -228,8 +228,7 @@ class JavaPairDStream[K, V](val dstream: DStream[(K, V)])( */ def groupByKeyAndWindow(windowDuration: Duration, slideDuration: Duration, numPartitions: Int) : JavaPairDStream[K, JIterable[V]] = { - dstream.groupByKeyAndWindow(windowDuration, slideDuration, numPartitions) - .mapValues(asJavaIterable _) + dstream.groupByKeyAndWindow(windowDuration, slideDuration, numPartitions).mapValues(_.asJava) } /** @@ -248,8 +247,7 @@ class JavaPairDStream[K, V](val dstream: DStream[(K, V)])( slideDuration: Duration, partitioner: Partitioner ): JavaPairDStream[K, JIterable[V]] = { - dstream.groupByKeyAndWindow(windowDuration, slideDuration, partitioner) - .mapValues(asJavaIterable _) + dstream.groupByKeyAndWindow(windowDuration, slideDuration, partitioner).mapValues(_.asJava) } /** @@ -431,7 +429,7 @@ class JavaPairDStream[K, V](val dstream: DStream[(K, V)])( private def convertUpdateStateFunction[S](in: JFunction2[JList[V], Optional[S], Optional[S]]): (Seq[V], Option[S]) => Option[S] = { val scalaFunc: (Seq[V], Option[S]) => Option[S] = (values, state) => { - val list: JList[V] = values + val list: JList[V] = values.asJava val scalaState: Optional[S] = JavaUtils.optionToOptional(state) val result: Optional[S] = in.apply(list, scalaState) result.isPresent match { @@ -539,7 +537,7 @@ class JavaPairDStream[K, V](val dstream: DStream[(K, V)])( */ def cogroup[W](other: JavaPairDStream[K, W]): JavaPairDStream[K, (JIterable[V], JIterable[W])] = { implicit val cm: ClassTag[W] = fakeClassTag - dstream.cogroup(other.dstream).mapValues(t => (asJavaIterable(t._1), asJavaIterable((t._2)))) + dstream.cogroup(other.dstream).mapValues(t => (t._1.asJava, t._2.asJava)) } /** @@ -551,8 +549,7 @@ class JavaPairDStream[K, V](val dstream: DStream[(K, V)])( numPartitions: Int ): JavaPairDStream[K, (JIterable[V], JIterable[W])] = { implicit val cm: ClassTag[W] = fakeClassTag - dstream.cogroup(other.dstream, numPartitions) - .mapValues(t => (asJavaIterable(t._1), asJavaIterable((t._2)))) + dstream.cogroup(other.dstream, numPartitions).mapValues(t => (t._1.asJava, t._2.asJava)) } /** @@ -564,8 +561,7 @@ class JavaPairDStream[K, V](val dstream: DStream[(K, V)])( partitioner: Partitioner ): JavaPairDStream[K, (JIterable[V], JIterable[W])] = { implicit val cm: ClassTag[W] = fakeClassTag - dstream.cogroup(other.dstream, partitioner) - .mapValues(t => (asJavaIterable(t._1), asJavaIterable((t._2)))) + dstream.cogroup(other.dstream, partitioner).mapValues(t => (t._1.asJava, t._2.asJava)) } /** diff --git 
a/streaming/src/main/scala/org/apache/spark/streaming/api/java/JavaStreamingContext.scala b/streaming/src/main/scala/org/apache/spark/streaming/api/java/JavaStreamingContext.scala index 35cc3ce5cf468..13f371f29603a 100644 --- a/streaming/src/main/scala/org/apache/spark/streaming/api/java/JavaStreamingContext.scala +++ b/streaming/src/main/scala/org/apache/spark/streaming/api/java/JavaStreamingContext.scala @@ -21,7 +21,7 @@ import java.lang.{Boolean => JBoolean} import java.io.{Closeable, InputStream} import java.util.{List => JList, Map => JMap} -import scala.collection.JavaConversions._ +import scala.collection.JavaConverters._ import scala.reflect.ClassTag import akka.actor.{Props, SupervisorStrategy} @@ -115,7 +115,13 @@ class JavaStreamingContext(val ssc: StreamingContext) extends Closeable { sparkHome: String, jars: Array[String], environment: JMap[String, String]) = - this(new StreamingContext(master, appName, batchDuration, sparkHome, jars, environment)) + this(new StreamingContext( + master, + appName, + batchDuration, + sparkHome, + jars, + environment.asScala)) /** * Create a JavaStreamingContext using an existing JavaSparkContext. @@ -197,7 +203,7 @@ class JavaStreamingContext(val ssc: StreamingContext) extends Closeable { converter: JFunction[InputStream, java.lang.Iterable[T]], storageLevel: StorageLevel) : JavaReceiverInputDStream[T] = { - def fn: (InputStream) => Iterator[T] = (x: InputStream) => converter.call(x).toIterator + def fn: (InputStream) => Iterator[T] = (x: InputStream) => converter.call(x).iterator().asScala implicit val cmt: ClassTag[T] = implicitly[ClassTag[AnyRef]].asInstanceOf[ClassTag[T]] ssc.socketStream(hostname, port, fn, storageLevel) @@ -432,7 +438,7 @@ class JavaStreamingContext(val ssc: StreamingContext) extends Closeable { implicit val cm: ClassTag[T] = implicitly[ClassTag[AnyRef]].asInstanceOf[ClassTag[T]] val sQueue = new scala.collection.mutable.Queue[RDD[T]] - sQueue.enqueue(queue.map(_.rdd).toSeq: _*) + sQueue.enqueue(queue.asScala.map(_.rdd).toSeq: _*) ssc.queueStream(sQueue) } @@ -456,7 +462,7 @@ class JavaStreamingContext(val ssc: StreamingContext) extends Closeable { implicit val cm: ClassTag[T] = implicitly[ClassTag[AnyRef]].asInstanceOf[ClassTag[T]] val sQueue = new scala.collection.mutable.Queue[RDD[T]] - sQueue.enqueue(queue.map(_.rdd).toSeq: _*) + sQueue.enqueue(queue.asScala.map(_.rdd).toSeq: _*) ssc.queueStream(sQueue, oneAtATime) } @@ -481,7 +487,7 @@ class JavaStreamingContext(val ssc: StreamingContext) extends Closeable { implicit val cm: ClassTag[T] = implicitly[ClassTag[AnyRef]].asInstanceOf[ClassTag[T]] val sQueue = new scala.collection.mutable.Queue[RDD[T]] - sQueue.enqueue(queue.map(_.rdd).toSeq: _*) + sQueue.enqueue(queue.asScala.map(_.rdd).toSeq: _*) ssc.queueStream(sQueue, oneAtATime, defaultRDD.rdd) } @@ -500,7 +506,7 @@ class JavaStreamingContext(val ssc: StreamingContext) extends Closeable { * Create a unified DStream from multiple DStreams of the same type and same slide duration. 
*/ def union[T](first: JavaDStream[T], rest: JList[JavaDStream[T]]): JavaDStream[T] = { - val dstreams: Seq[DStream[T]] = (Seq(first) ++ asScalaBuffer(rest)).map(_.dstream) + val dstreams: Seq[DStream[T]] = (Seq(first) ++ rest.asScala).map(_.dstream) implicit val cm: ClassTag[T] = first.classTag ssc.union(dstreams)(cm) } @@ -512,7 +518,7 @@ class JavaStreamingContext(val ssc: StreamingContext) extends Closeable { first: JavaPairDStream[K, V], rest: JList[JavaPairDStream[K, V]] ): JavaPairDStream[K, V] = { - val dstreams: Seq[DStream[(K, V)]] = (Seq(first) ++ asScalaBuffer(rest)).map(_.dstream) + val dstreams: Seq[DStream[(K, V)]] = (Seq(first) ++ rest.asScala).map(_.dstream) implicit val cm: ClassTag[(K, V)] = first.classTag implicit val kcm: ClassTag[K] = first.kManifest implicit val vcm: ClassTag[V] = first.vManifest @@ -534,12 +540,11 @@ class JavaStreamingContext(val ssc: StreamingContext) extends Closeable { ): JavaDStream[T] = { implicit val cmt: ClassTag[T] = implicitly[ClassTag[AnyRef]].asInstanceOf[ClassTag[T]] - val scalaDStreams = dstreams.map(_.dstream).toSeq val scalaTransformFunc = (rdds: Seq[RDD[_]], time: Time) => { - val jrdds = rdds.map(rdd => JavaRDD.fromRDD[AnyRef](rdd.asInstanceOf[RDD[AnyRef]])).toList + val jrdds = rdds.map(JavaRDD.fromRDD(_)).asJava transformFunc.call(jrdds, time).rdd } - ssc.transform(scalaDStreams, scalaTransformFunc) + ssc.transform(dstreams.asScala.map(_.dstream).toSeq, scalaTransformFunc) } /** @@ -559,12 +564,11 @@ class JavaStreamingContext(val ssc: StreamingContext) extends Closeable { implicitly[ClassTag[AnyRef]].asInstanceOf[ClassTag[K]] implicit val cmv: ClassTag[V] = implicitly[ClassTag[AnyRef]].asInstanceOf[ClassTag[V]] - val scalaDStreams = dstreams.map(_.dstream).toSeq val scalaTransformFunc = (rdds: Seq[RDD[_]], time: Time) => { - val jrdds = rdds.map(rdd => JavaRDD.fromRDD[AnyRef](rdd.asInstanceOf[RDD[AnyRef]])).toList + val jrdds = rdds.map(JavaRDD.fromRDD(_)).asJava transformFunc.call(jrdds, time).rdd } - ssc.transform(scalaDStreams, scalaTransformFunc) + ssc.transform(dstreams.asScala.map(_.dstream).toSeq, scalaTransformFunc) } /** diff --git a/streaming/src/main/scala/org/apache/spark/streaming/api/python/PythonDStream.scala b/streaming/src/main/scala/org/apache/spark/streaming/api/python/PythonDStream.scala index d06401245ff17..2c373640d2fd9 100644 --- a/streaming/src/main/scala/org/apache/spark/streaming/api/python/PythonDStream.scala +++ b/streaming/src/main/scala/org/apache/spark/streaming/api/python/PythonDStream.scala @@ -20,14 +20,13 @@ package org.apache.spark.streaming.api.python import java.io.{ObjectInputStream, ObjectOutputStream} import java.lang.reflect.Proxy import java.util.{ArrayList => JArrayList, List => JList} -import scala.collection.JavaConversions._ + import scala.collection.JavaConverters._ import scala.language.existentials import py4j.GatewayServer import org.apache.spark.api.java._ -import org.apache.spark.api.python._ import org.apache.spark.rdd.RDD import org.apache.spark.storage.StorageLevel import org.apache.spark.streaming.{Interval, Duration, Time} @@ -161,7 +160,7 @@ private[python] object PythonDStream { */ def toRDDQueue(rdds: JArrayList[JavaRDD[Array[Byte]]]): java.util.Queue[JavaRDD[Array[Byte]]] = { val queue = new java.util.LinkedList[JavaRDD[Array[Byte]]] - rdds.forall(queue.add(_)) + rdds.asScala.foreach(queue.add) queue } } diff --git a/streaming/src/main/scala/org/apache/spark/streaming/receiver/Receiver.scala b/streaming/src/main/scala/org/apache/spark/streaming/receiver/Receiver.scala 
index 554aae0117b24..2252e28f22af8 100644 --- a/streaming/src/main/scala/org/apache/spark/streaming/receiver/Receiver.scala +++ b/streaming/src/main/scala/org/apache/spark/streaming/receiver/Receiver.scala @@ -20,7 +20,7 @@ package org.apache.spark.streaming.receiver import java.nio.ByteBuffer import scala.collection.mutable.ArrayBuffer -import scala.collection.JavaConversions._ +import scala.collection.JavaConverters._ import org.apache.spark.storage.StorageLevel import org.apache.spark.annotation.DeveloperApi @@ -144,12 +144,12 @@ abstract class Receiver[T](val storageLevel: StorageLevel) extends Serializable * for being used in the corresponding InputDStream. */ def store(dataIterator: java.util.Iterator[T], metadata: Any) { - supervisor.pushIterator(dataIterator, Some(metadata), None) + supervisor.pushIterator(dataIterator.asScala, Some(metadata), None) } /** Store an iterator of received data as a data block into Spark's memory. */ def store(dataIterator: java.util.Iterator[T]) { - supervisor.pushIterator(dataIterator, None, None) + supervisor.pushIterator(dataIterator.asScala, None, None) } /** diff --git a/streaming/src/main/scala/org/apache/spark/streaming/scheduler/JobScheduler.scala b/streaming/src/main/scala/org/apache/spark/streaming/scheduler/JobScheduler.scala index 6d4cdc4aa6b10..0cd39594ee923 100644 --- a/streaming/src/main/scala/org/apache/spark/streaming/scheduler/JobScheduler.scala +++ b/streaming/src/main/scala/org/apache/spark/streaming/scheduler/JobScheduler.scala @@ -19,7 +19,7 @@ package org.apache.spark.streaming.scheduler import java.util.concurrent.{ConcurrentHashMap, TimeUnit} -import scala.collection.JavaConversions._ +import scala.collection.JavaConverters._ import scala.util.{Failure, Success} import org.apache.spark.Logging @@ -128,7 +128,7 @@ class JobScheduler(val ssc: StreamingContext) extends Logging { } def getPendingTimes(): Seq[Time] = { - jobSets.keySet.toSeq + jobSets.asScala.keys.toSeq } def reportError(msg: String, e: Throwable) { diff --git a/streaming/src/main/scala/org/apache/spark/streaming/scheduler/ReceivedBlockTracker.scala b/streaming/src/main/scala/org/apache/spark/streaming/scheduler/ReceivedBlockTracker.scala index 53b96d51c9180..f2711d1355e60 100644 --- a/streaming/src/main/scala/org/apache/spark/streaming/scheduler/ReceivedBlockTracker.scala +++ b/streaming/src/main/scala/org/apache/spark/streaming/scheduler/ReceivedBlockTracker.scala @@ -19,6 +19,7 @@ package org.apache.spark.streaming.scheduler import java.nio.ByteBuffer +import scala.collection.JavaConverters._ import scala.collection.mutable import scala.language.implicitConversions @@ -196,8 +197,7 @@ private[streaming] class ReceivedBlockTracker( writeAheadLogOption.foreach { writeAheadLog => logInfo(s"Recovering from write ahead logs in ${checkpointDirOption.get}") - import scala.collection.JavaConversions._ - writeAheadLog.readAll().foreach { byteBuffer => + writeAheadLog.readAll().asScala.foreach { byteBuffer => logTrace("Recovering record " + byteBuffer) Utils.deserialize[ReceivedBlockTrackerLogEvent]( byteBuffer.array, Thread.currentThread().getContextClassLoader) match { diff --git a/streaming/src/main/scala/org/apache/spark/streaming/util/FileBasedWriteAheadLog.scala b/streaming/src/main/scala/org/apache/spark/streaming/util/FileBasedWriteAheadLog.scala index fe6328b1ce727..9f4a4d6806ab5 100644 --- a/streaming/src/main/scala/org/apache/spark/streaming/util/FileBasedWriteAheadLog.scala +++ 
b/streaming/src/main/scala/org/apache/spark/streaming/util/FileBasedWriteAheadLog.scala @@ -19,6 +19,7 @@ package org.apache.spark.streaming.util import java.nio.ByteBuffer import java.util.{Iterator => JIterator} +import scala.collection.JavaConverters._ import scala.collection.mutable.ArrayBuffer import scala.concurrent.{Await, ExecutionContext, Future} import scala.language.postfixOps @@ -118,7 +119,6 @@ private[streaming] class FileBasedWriteAheadLog( * hence the implementation is kept simple. */ def readAll(): JIterator[ByteBuffer] = synchronized { - import scala.collection.JavaConversions._ val logFilesToRead = pastLogs.map{ _.path} ++ currentLogPath logInfo("Reading from the logs: " + logFilesToRead.mkString("\n")) @@ -126,7 +126,7 @@ private[streaming] class FileBasedWriteAheadLog( logDebug(s"Creating log reader with $file") val reader = new FileBasedWriteAheadLogReader(file, hadoopConf) CompletionIterator[ByteBuffer, Iterator[ByteBuffer]](reader, reader.close _) - } flatMap { x => x } + }.flatten.asJava } /** diff --git a/streaming/src/test/java/org/apache/spark/streaming/JavaTestUtils.scala b/streaming/src/test/java/org/apache/spark/streaming/JavaTestUtils.scala index bb80bff6dc2e6..57b50bdfd6520 100644 --- a/streaming/src/test/java/org/apache/spark/streaming/JavaTestUtils.scala +++ b/streaming/src/test/java/org/apache/spark/streaming/JavaTestUtils.scala @@ -17,16 +17,13 @@ package org.apache.spark.streaming -import scala.collection.mutable.{SynchronizedBuffer, ArrayBuffer} +import java.util.{List => JList} + +import scala.collection.JavaConverters._ import scala.reflect.ClassTag -import java.util.{List => JList} -import org.apache.spark.streaming.api.java.{JavaPairDStream, JavaDStreamLike, JavaDStream, JavaStreamingContext} -import org.apache.spark.streaming._ -import java.util.ArrayList -import collection.JavaConversions._ import org.apache.spark.api.java.JavaRDDLike -import org.apache.spark.streaming.dstream.DStream +import org.apache.spark.streaming.api.java.{JavaDStreamLike, JavaDStream, JavaStreamingContext} /** Exposes streaming test functionality in a Java-friendly way. 
*/ trait JavaTestBase extends TestSuiteBase { @@ -39,7 +36,7 @@ trait JavaTestBase extends TestSuiteBase { ssc: JavaStreamingContext, data: JList[JList[T]], numPartitions: Int) = { - val seqData = data.map(Seq(_:_*)) + val seqData = data.asScala.map(_.asScala) implicit val cm: ClassTag[T] = implicitly[ClassTag[AnyRef]].asInstanceOf[ClassTag[T]] @@ -72,9 +69,7 @@ trait JavaTestBase extends TestSuiteBase { implicitly[ClassTag[AnyRef]].asInstanceOf[ClassTag[V]] ssc.getState() val res = runStreams[V](ssc.ssc, numBatches, numExpectedOutput) - val out = new ArrayList[JList[V]]() - res.map(entry => out.append(new ArrayList[V](entry))) - out + res.map(_.asJava).asJava } /** @@ -90,12 +85,7 @@ trait JavaTestBase extends TestSuiteBase { implicit val cm: ClassTag[V] = implicitly[ClassTag[AnyRef]].asInstanceOf[ClassTag[V]] val res = runStreamsWithPartitions[V](ssc.ssc, numBatches, numExpectedOutput) - val out = new ArrayList[JList[JList[V]]]() - res.map{entry => - val lists = entry.map(new ArrayList[V](_)) - out.append(new ArrayList[JList[V]](lists)) - } - out + res.map(entry => entry.map(_.asJava).asJava).asJava } } diff --git a/streaming/src/test/scala/org/apache/spark/streaming/util/WriteAheadLogSuite.scala b/streaming/src/test/scala/org/apache/spark/streaming/util/WriteAheadLogSuite.scala index 325ff7c74c39d..5e49fd00769ad 100644 --- a/streaming/src/test/scala/org/apache/spark/streaming/util/WriteAheadLogSuite.scala +++ b/streaming/src/test/scala/org/apache/spark/streaming/util/WriteAheadLogSuite.scala @@ -20,6 +20,7 @@ import java.io._ import java.nio.ByteBuffer import java.util +import scala.collection.JavaConverters._ import scala.collection.mutable.ArrayBuffer import scala.concurrent.duration._ import scala.language.{implicitConversions, postfixOps} @@ -417,9 +418,8 @@ object WriteAheadLogSuite { /** Read all the data in the log file in a directory using the WriteAheadLog class. 
*/ def readDataUsingWriteAheadLog(logDirectory: String): Seq[String] = { - import scala.collection.JavaConversions._ val wal = new FileBasedWriteAheadLog(new SparkConf(), logDirectory, hadoopConf, 1, 1) - val data = wal.readAll().map(byteBufferToString).toSeq + val data = wal.readAll().asScala.map(byteBufferToString).toSeq wal.close() data } diff --git a/tools/src/main/scala/org/apache/spark/tools/GenerateMIMAIgnore.scala b/tools/src/main/scala/org/apache/spark/tools/GenerateMIMAIgnore.scala index 9418beb6b3e3a..a0524cabff2d4 100644 --- a/tools/src/main/scala/org/apache/spark/tools/GenerateMIMAIgnore.scala +++ b/tools/src/main/scala/org/apache/spark/tools/GenerateMIMAIgnore.scala @@ -22,7 +22,7 @@ import java.io.File import java.util.jar.JarFile import scala.collection.mutable -import scala.collection.JavaConversions._ +import scala.collection.JavaConverters._ import scala.reflect.runtime.universe.runtimeMirror import scala.reflect.runtime.{universe => unv} import scala.util.Try @@ -161,7 +161,7 @@ object GenerateMIMAIgnore { val path = packageName.replace('.', '/') val resources = classLoader.getResources(path) - val jars = resources.filter(x => x.getProtocol == "jar") + val jars = resources.asScala.filter(_.getProtocol == "jar") .map(_.getFile.split(":")(1).split("!")(0)).toSeq jars.flatMap(getClassesFromJar(_, path)) @@ -175,7 +175,7 @@ object GenerateMIMAIgnore { private def getClassesFromJar(jarPath: String, packageName: String) = { import scala.collection.mutable val jar = new JarFile(new File(jarPath)) - val enums = jar.entries().map(_.getName).filter(_.startsWith(packageName)) + val enums = jar.entries().asScala.map(_.getName).filter(_.startsWith(packageName)) val classes = mutable.HashSet[Class[_]]() for (entry <- enums if entry.endsWith(".class")) { try { diff --git a/yarn/src/main/scala/org/apache/spark/deploy/yarn/Client.scala b/yarn/src/main/scala/org/apache/spark/deploy/yarn/Client.scala index bff585b46cbbe..e9a02baafd28e 100644 --- a/yarn/src/main/scala/org/apache/spark/deploy/yarn/Client.scala +++ b/yarn/src/main/scala/org/apache/spark/deploy/yarn/Client.scala @@ -25,7 +25,7 @@ import java.security.PrivilegedExceptionAction import java.util.{Properties, UUID} import java.util.zip.{ZipEntry, ZipOutputStream} -import scala.collection.JavaConversions._ +import scala.collection.JavaConverters._ import scala.collection.mutable.{ArrayBuffer, HashMap, HashSet, ListBuffer, Map} import scala.reflect.runtime.universe import scala.util.{Try, Success, Failure} @@ -511,7 +511,7 @@ private[spark] class Client( val nns = YarnSparkHadoopUtil.get.getNameNodesToAccess(sparkConf) + stagingDirPath YarnSparkHadoopUtil.get.obtainTokensForNamenodes( nns, hadoopConf, creds, Some(sparkConf.get("spark.yarn.principal"))) - val t = creds.getAllTokens + val t = creds.getAllTokens.asScala .filter(_.getKind == DelegationTokenIdentifier.HDFS_DELEGATION_KIND) .head val newExpiration = t.renew(hadoopConf) @@ -650,8 +650,8 @@ private[spark] class Client( distCacheMgr.setDistArchivesEnv(launchEnv) val amContainer = Records.newRecord(classOf[ContainerLaunchContext]) - amContainer.setLocalResources(localResources) - amContainer.setEnvironment(launchEnv) + amContainer.setLocalResources(localResources.asJava) + amContainer.setEnvironment(launchEnv.asJava) val javaOpts = ListBuffer[String]() @@ -782,7 +782,7 @@ private[spark] class Client( // TODO: it would be nicer to just make sure there are no null commands here val printableCommands = commands.map(s => if (s == null) "null" else s).toList - 
amContainer.setCommands(printableCommands) + amContainer.setCommands(printableCommands.asJava) logDebug("===============================================================================") logDebug("YARN AM launch context:") @@ -797,7 +797,8 @@ private[spark] class Client( // send the acl settings into YARN to control who has access via YARN interfaces val securityManager = new SecurityManager(sparkConf) - amContainer.setApplicationACLs(YarnSparkHadoopUtil.getApplicationAclsForYarn(securityManager)) + amContainer.setApplicationACLs( + YarnSparkHadoopUtil.getApplicationAclsForYarn(securityManager).asJava) setupSecurityToken(amContainer) UserGroupInformation.getCurrentUser().addCredentials(credentials) diff --git a/yarn/src/main/scala/org/apache/spark/deploy/yarn/ExecutorRunnable.scala b/yarn/src/main/scala/org/apache/spark/deploy/yarn/ExecutorRunnable.scala index 4cc50483a17ff..9abd09b3cc7a5 100644 --- a/yarn/src/main/scala/org/apache/spark/deploy/yarn/ExecutorRunnable.scala +++ b/yarn/src/main/scala/org/apache/spark/deploy/yarn/ExecutorRunnable.scala @@ -20,14 +20,13 @@ package org.apache.spark.deploy.yarn import java.io.File import java.net.URI import java.nio.ByteBuffer +import java.util.Collections -import org.apache.hadoop.fs.Path -import org.apache.hadoop.yarn.api.ApplicationConstants.Environment -import org.apache.spark.util.Utils - -import scala.collection.JavaConversions._ +import scala.collection.JavaConverters._ import scala.collection.mutable.{HashMap, ListBuffer} +import org.apache.hadoop.fs.Path +import org.apache.hadoop.yarn.api.ApplicationConstants.Environment import org.apache.hadoop.conf.Configuration import org.apache.hadoop.io.DataOutputBuffer import org.apache.hadoop.security.UserGroupInformation @@ -40,6 +39,7 @@ import org.apache.hadoop.yarn.util.{ConverterUtils, Records} import org.apache.spark.{Logging, SecurityManager, SparkConf, SparkException} import org.apache.spark.network.util.JavaUtils +import org.apache.spark.util.Utils class ExecutorRunnable( container: Container, @@ -74,9 +74,9 @@ class ExecutorRunnable( .asInstanceOf[ContainerLaunchContext] val localResources = prepareLocalResources - ctx.setLocalResources(localResources) + ctx.setLocalResources(localResources.asJava) - ctx.setEnvironment(env) + ctx.setEnvironment(env.asJava) val credentials = UserGroupInformation.getCurrentUser().getCredentials() val dob = new DataOutputBuffer() @@ -96,8 +96,9 @@ class ExecutorRunnable( |=============================================================================== """.stripMargin) - ctx.setCommands(commands) - ctx.setApplicationACLs(YarnSparkHadoopUtil.getApplicationAclsForYarn(securityMgr)) + ctx.setCommands(commands.asJava) + ctx.setApplicationACLs( + YarnSparkHadoopUtil.getApplicationAclsForYarn(securityMgr).asJava) // If external shuffle service is enabled, register with the Yarn shuffle service already // started on the NodeManager and, if authentication is enabled, provide it with our secret @@ -112,7 +113,7 @@ class ExecutorRunnable( // Authentication is not enabled, so just provide dummy metadata ByteBuffer.allocate(0) } - ctx.setServiceData(Map[String, ByteBuffer]("spark_shuffle" -> secretBytes)) + ctx.setServiceData(Collections.singletonMap("spark_shuffle", secretBytes)) } // Send the start request to the ContainerManager @@ -314,7 +315,8 @@ class ExecutorRunnable( env("SPARK_LOG_URL_STDOUT") = s"$baseUrl/stdout?start=-4096" } - System.getenv().filterKeys(_.startsWith("SPARK")).foreach { case (k, v) => env(k) = v } + 
System.getenv().asScala.filterKeys(_.startsWith("SPARK")) + .foreach { case (k, v) => env(k) = v } env } } diff --git a/yarn/src/main/scala/org/apache/spark/deploy/yarn/YarnAllocator.scala b/yarn/src/main/scala/org/apache/spark/deploy/yarn/YarnAllocator.scala index ccf753e69f4b6..5f897cbcb4e9f 100644 --- a/yarn/src/main/scala/org/apache/spark/deploy/yarn/YarnAllocator.scala +++ b/yarn/src/main/scala/org/apache/spark/deploy/yarn/YarnAllocator.scala @@ -21,9 +21,7 @@ import java.util.Collections import java.util.concurrent._ import java.util.regex.Pattern -import org.apache.spark.util.Utils - -import scala.collection.JavaConversions._ +import scala.collection.JavaConverters._ import scala.collection.mutable.{ArrayBuffer, HashMap, HashSet} import com.google.common.util.concurrent.ThreadFactoryBuilder @@ -39,8 +37,8 @@ import org.apache.log4j.{Level, Logger} import org.apache.spark.{Logging, SecurityManager, SparkConf} import org.apache.spark.deploy.yarn.YarnSparkHadoopUtil._ import org.apache.spark.rpc.RpcEndpointRef -import org.apache.spark.scheduler.cluster.CoarseGrainedSchedulerBackend import org.apache.spark.scheduler.cluster.CoarseGrainedClusterMessages._ +import org.apache.spark.util.Utils /** * YarnAllocator is charged with requesting containers from the YARN ResourceManager and deciding @@ -164,7 +162,7 @@ private[yarn] class YarnAllocator( * Number of container requests at the given location that have not yet been fulfilled. */ private def getNumPendingAtLocation(location: String): Int = - amClient.getMatchingRequests(RM_REQUEST_PRIORITY, location, resource).map(_.size).sum + amClient.getMatchingRequests(RM_REQUEST_PRIORITY, location, resource).asScala.map(_.size).sum /** * Request as many executors from the ResourceManager as needed to reach the desired total. If @@ -231,14 +229,14 @@ private[yarn] class YarnAllocator( numExecutorsRunning, allocateResponse.getAvailableResources)) - handleAllocatedContainers(allocatedContainers) + handleAllocatedContainers(allocatedContainers.asScala) } val completedContainers = allocateResponse.getCompletedContainersStatuses() if (completedContainers.size > 0) { logDebug("Completed %d containers".format(completedContainers.size)) - processCompletedContainers(completedContainers) + processCompletedContainers(completedContainers.asScala) logDebug("Finished processing %d completed containers. Current running executor count: %d." 
.format(completedContainers.size, numExecutorsRunning)) @@ -271,7 +269,7 @@ private[yarn] class YarnAllocator( val request = createContainerRequest(resource, locality.nodes, locality.racks) amClient.addContainerRequest(request) val nodes = request.getNodes - val hostStr = if (nodes == null || nodes.isEmpty) "Any" else nodes.last + val hostStr = if (nodes == null || nodes.isEmpty) "Any" else nodes.asScala.last logInfo(s"Container request (host: $hostStr, capability: $resource)") } } else if (missing < 0) { @@ -280,7 +278,8 @@ private[yarn] class YarnAllocator( val matchingRequests = amClient.getMatchingRequests(RM_REQUEST_PRIORITY, ANY_HOST, resource) if (!matchingRequests.isEmpty) { - matchingRequests.head.take(numToCancel).foreach(amClient.removeContainerRequest) + matchingRequests.iterator().next().asScala + .take(numToCancel).foreach(amClient.removeContainerRequest) } else { logWarning("Expected to find pending requests, but found none.") } @@ -459,7 +458,7 @@ private[yarn] class YarnAllocator( } } - if (allocatedContainerToHostMap.containsKey(containerId)) { + if (allocatedContainerToHostMap.contains(containerId)) { val host = allocatedContainerToHostMap.get(containerId).get val containerSet = allocatedHostToContainersMap.get(host).get diff --git a/yarn/src/main/scala/org/apache/spark/deploy/yarn/YarnRMClient.scala b/yarn/src/main/scala/org/apache/spark/deploy/yarn/YarnRMClient.scala index 4999f9c06210a..df042bf291de7 100644 --- a/yarn/src/main/scala/org/apache/spark/deploy/yarn/YarnRMClient.scala +++ b/yarn/src/main/scala/org/apache/spark/deploy/yarn/YarnRMClient.scala @@ -19,17 +19,15 @@ package org.apache.spark.deploy.yarn import java.util.{List => JList} -import scala.collection.JavaConversions._ +import scala.collection.JavaConverters._ import scala.collection.{Map, Set} import scala.util.Try import org.apache.hadoop.conf.Configuration -import org.apache.hadoop.yarn.api.ApplicationConstants import org.apache.hadoop.yarn.api.records._ import org.apache.hadoop.yarn.client.api.AMRMClient import org.apache.hadoop.yarn.client.api.AMRMClient.ContainerRequest import org.apache.hadoop.yarn.conf.YarnConfiguration -import org.apache.hadoop.yarn.util.ConverterUtils import org.apache.hadoop.yarn.webapp.util.WebAppUtils import org.apache.spark.{Logging, SecurityManager, SparkConf} @@ -108,8 +106,8 @@ private[spark] class YarnRMClient(args: ApplicationMasterArguments) extends Logg val method = classOf[WebAppUtils].getMethod("getProxyHostsAndPortsForAmFilter", classOf[Configuration]) val proxies = method.invoke(null, conf).asInstanceOf[JList[String]] - val hosts = proxies.map { proxy => proxy.split(":")(0) } - val uriBases = proxies.map { proxy => prefix + proxy + proxyBase } + val hosts = proxies.asScala.map { proxy => proxy.split(":")(0) } + val uriBases = proxies.asScala.map { proxy => prefix + proxy + proxyBase } Map("PROXY_HOSTS" -> hosts.mkString(","), "PROXY_URI_BASES" -> uriBases.mkString(",")) } catch { case e: NoSuchMethodException => diff --git a/yarn/src/test/scala/org/apache/spark/deploy/yarn/BaseYarnClusterSuite.scala b/yarn/src/test/scala/org/apache/spark/deploy/yarn/BaseYarnClusterSuite.scala index 128e996b71fe5..b4f8049bff577 100644 --- a/yarn/src/test/scala/org/apache/spark/deploy/yarn/BaseYarnClusterSuite.scala +++ b/yarn/src/test/scala/org/apache/spark/deploy/yarn/BaseYarnClusterSuite.scala @@ -21,7 +21,7 @@ import java.io.{File, FileOutputStream, OutputStreamWriter} import java.util.Properties import java.util.concurrent.TimeUnit -import scala.collection.JavaConversions._ 
+import scala.collection.JavaConverters._ import com.google.common.base.Charsets.UTF_8 import com.google.common.io.Files @@ -132,7 +132,7 @@ abstract class BaseYarnClusterSuite props.setProperty("spark.driver.extraJavaOptions", "-Dfoo=\"one two three\"") props.setProperty("spark.executor.extraJavaOptions", "-Dfoo=\"one two three\"") - yarnCluster.getConfig().foreach { e => + yarnCluster.getConfig.asScala.foreach { e => props.setProperty("spark.hadoop." + e.getKey(), e.getValue()) } @@ -149,7 +149,7 @@ abstract class BaseYarnClusterSuite props.store(writer, "Spark properties.") writer.close() - val extraJarArgs = if (!extraJars.isEmpty()) Seq("--jars", extraJars.mkString(",")) else Nil + val extraJarArgs = if (extraJars.nonEmpty) Seq("--jars", extraJars.mkString(",")) else Nil val mainArgs = if (klass.endsWith(".py")) { Seq(klass) diff --git a/yarn/src/test/scala/org/apache/spark/deploy/yarn/ClientSuite.scala b/yarn/src/test/scala/org/apache/spark/deploy/yarn/ClientSuite.scala index 0a5402c89e764..e7f2501e7899f 100644 --- a/yarn/src/test/scala/org/apache/spark/deploy/yarn/ClientSuite.scala +++ b/yarn/src/test/scala/org/apache/spark/deploy/yarn/ClientSuite.scala @@ -20,8 +20,8 @@ package org.apache.spark.deploy.yarn import java.io.File import java.net.URI -import scala.collection.JavaConversions._ -import scala.collection.mutable.{ HashMap => MutableHashMap } +import scala.collection.JavaConverters._ +import scala.collection.mutable.{HashMap => MutableHashMap} import scala.reflect.ClassTag import scala.util.Try @@ -38,7 +38,7 @@ import org.mockito.Matchers._ import org.mockito.Mockito._ import org.scalatest.{BeforeAndAfterAll, Matchers} -import org.apache.spark.{SparkConf, SparkException, SparkFunSuite} +import org.apache.spark.{SparkConf, SparkFunSuite} import org.apache.spark.util.Utils class ClientSuite extends SparkFunSuite with Matchers with BeforeAndAfterAll { @@ -201,7 +201,7 @@ class ClientSuite extends SparkFunSuite with Matchers with BeforeAndAfterAll { appContext.getClass.getMethods.filter(_.getName.equals("getApplicationTags")).foreach{ method => val tags = method.invoke(appContext).asInstanceOf[java.util.Set[String]] tags should contain allOf ("tag1", "dup", "tag2", "multi word") - tags.filter(!_.isEmpty).size should be (4) + tags.asScala.filter(_.nonEmpty).size should be (4) } appContext.getMaxAppAttempts should be (42) } diff --git a/yarn/src/test/scala/org/apache/spark/deploy/yarn/YarnClusterSuite.scala b/yarn/src/test/scala/org/apache/spark/deploy/yarn/YarnClusterSuite.scala index 128350b648992..5a4ea2ea2f4ff 100644 --- a/yarn/src/test/scala/org/apache/spark/deploy/yarn/YarnClusterSuite.scala +++ b/yarn/src/test/scala/org/apache/spark/deploy/yarn/YarnClusterSuite.scala @@ -21,7 +21,6 @@ import java.io.File import java.net.URL import scala.collection.mutable -import scala.collection.JavaConversions._ import com.google.common.base.Charsets.UTF_8 import com.google.common.io.{ByteStreams, Files} @@ -216,8 +215,8 @@ private object YarnClusterDriver extends Logging with Matchers { assert(listener.driverLogs.nonEmpty) val driverLogs = listener.driverLogs.get assert(driverLogs.size === 2) - assert(driverLogs.containsKey("stderr")) - assert(driverLogs.containsKey("stdout")) + assert(driverLogs.contains("stderr")) + assert(driverLogs.contains("stdout")) val urlStr = driverLogs("stderr") // Ensure that this is a valid URL, else this will throw an exception new URL(urlStr) From 5c08c86bfa43462fb2ca5f7c5980ddfb44dd57f8 Mon Sep 17 00:00:00 2001 From: Michael Armbrust Date: Tue, 25 Aug 
2015 10:22:54 -0700 Subject: [PATCH 072/802] [SPARK-10198] [SQL] Turn off partition verification by default Author: Michael Armbrust Closes #8404 from marmbrus/turnOffPartitionVerification. --- .../scala/org/apache/spark/sql/SQLConf.scala | 2 +- .../spark/sql/hive/QueryPartitionSuite.scala | 64 ++++++++++--------- 2 files changed, 35 insertions(+), 31 deletions(-) diff --git a/sql/core/src/main/scala/org/apache/spark/sql/SQLConf.scala b/sql/core/src/main/scala/org/apache/spark/sql/SQLConf.scala index e6f7619519e6a..9de75f4c4d084 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/SQLConf.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/SQLConf.scala @@ -312,7 +312,7 @@ private[spark] object SQLConf { doc = "When true, enable filter pushdown for ORC files.") val HIVE_VERIFY_PARTITION_PATH = booleanConf("spark.sql.hive.verifyPartitionPath", - defaultValue = Some(true), + defaultValue = Some(false), doc = "") val HIVE_METASTORE_PARTITION_PRUNING = booleanConf("spark.sql.hive.metastorePartitionPruning", diff --git a/sql/hive/src/test/scala/org/apache/spark/sql/hive/QueryPartitionSuite.scala b/sql/hive/src/test/scala/org/apache/spark/sql/hive/QueryPartitionSuite.scala index 017bc2adc103b..1cc8a93e83411 100644 --- a/sql/hive/src/test/scala/org/apache/spark/sql/hive/QueryPartitionSuite.scala +++ b/sql/hive/src/test/scala/org/apache/spark/sql/hive/QueryPartitionSuite.scala @@ -18,50 +18,54 @@ package org.apache.spark.sql.hive import com.google.common.io.Files +import org.apache.spark.sql.test.SQLTestUtils import org.apache.spark.sql.{QueryTest, _} import org.apache.spark.util.Utils -class QueryPartitionSuite extends QueryTest { +class QueryPartitionSuite extends QueryTest with SQLTestUtils { private lazy val ctx = org.apache.spark.sql.hive.test.TestHive import ctx.implicits._ - import ctx.sql + + protected def _sqlContext = ctx test("SPARK-5068: query data when path doesn't exist"){ - val testData = ctx.sparkContext.parallelize( - (1 to 10).map(i => TestData(i, i.toString))).toDF() - testData.registerTempTable("testData") + withSQLConf((SQLConf.HIVE_VERIFY_PARTITION_PATH.key, "true")) { + val testData = ctx.sparkContext.parallelize( + (1 to 10).map(i => TestData(i, i.toString))).toDF() + testData.registerTempTable("testData") - val tmpDir = Files.createTempDir() - // create the table for test - sql(s"CREATE TABLE table_with_partition(key int,value string) " + - s"PARTITIONED by (ds string) location '${tmpDir.toURI.toString}' ") - sql("INSERT OVERWRITE TABLE table_with_partition partition (ds='1') " + - "SELECT key,value FROM testData") - sql("INSERT OVERWRITE TABLE table_with_partition partition (ds='2') " + - "SELECT key,value FROM testData") - sql("INSERT OVERWRITE TABLE table_with_partition partition (ds='3') " + - "SELECT key,value FROM testData") - sql("INSERT OVERWRITE TABLE table_with_partition partition (ds='4') " + - "SELECT key,value FROM testData") + val tmpDir = Files.createTempDir() + // create the table for test + sql(s"CREATE TABLE table_with_partition(key int,value string) " + + s"PARTITIONED by (ds string) location '${tmpDir.toURI.toString}' ") + sql("INSERT OVERWRITE TABLE table_with_partition partition (ds='1') " + + "SELECT key,value FROM testData") + sql("INSERT OVERWRITE TABLE table_with_partition partition (ds='2') " + + "SELECT key,value FROM testData") + sql("INSERT OVERWRITE TABLE table_with_partition partition (ds='3') " + + "SELECT key,value FROM testData") + sql("INSERT OVERWRITE TABLE table_with_partition partition (ds='4') " + + "SELECT key,value FROM 
testData") - // test for the exist path - checkAnswer(sql("select key,value from table_with_partition"), - testData.toDF.collect ++ testData.toDF.collect - ++ testData.toDF.collect ++ testData.toDF.collect) + // test for the exist path + checkAnswer(sql("select key,value from table_with_partition"), + testData.toDF.collect ++ testData.toDF.collect + ++ testData.toDF.collect ++ testData.toDF.collect) - // delete the path of one partition - tmpDir.listFiles - .find { f => f.isDirectory && f.getName().startsWith("ds=") } - .foreach { f => Utils.deleteRecursively(f) } + // delete the path of one partition + tmpDir.listFiles + .find { f => f.isDirectory && f.getName().startsWith("ds=") } + .foreach { f => Utils.deleteRecursively(f) } - // test for after delete the path - checkAnswer(sql("select key,value from table_with_partition"), - testData.toDF.collect ++ testData.toDF.collect ++ testData.toDF.collect) + // test for after delete the path + checkAnswer(sql("select key,value from table_with_partition"), + testData.toDF.collect ++ testData.toDF.collect ++ testData.toDF.collect) - sql("DROP TABLE table_with_partition") - sql("DROP TABLE createAndInsertTest") + sql("DROP TABLE table_with_partition") + sql("DROP TABLE createAndInsertTest") + } } } From b37f0cc1b4c064d6f09edb161250fa8b783de52a Mon Sep 17 00:00:00 2001 From: Yuhao Yang Date: Tue, 25 Aug 2015 10:54:03 -0700 Subject: [PATCH 073/802] [SPARK-8531] [ML] Update ML user guide for MinMaxScaler jira: https://issues.apache.org/jira/browse/SPARK-8531 Update ML user guide for MinMaxScaler Author: Yuhao Yang Author: unknown Closes #7211 from hhbyyh/minmaxdoc. --- docs/ml-features.md | 71 +++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 71 insertions(+) diff --git a/docs/ml-features.md b/docs/ml-features.md index 642a4b4c53183..62de4838981cb 100644 --- a/docs/ml-features.md +++ b/docs/ml-features.md @@ -1133,6 +1133,7 @@ val scaledData = scalerModel.transform(dataFrame) {% highlight java %} import org.apache.spark.api.java.JavaRDD; import org.apache.spark.ml.feature.StandardScaler; +import org.apache.spark.ml.feature.StandardScalerModel; import org.apache.spark.mllib.regression.LabeledPoint; import org.apache.spark.mllib.util.MLUtils; import org.apache.spark.sql.DataFrame; @@ -1173,6 +1174,76 @@ scaledData = scalerModel.transform(dataFrame)
+## MinMaxScaler + +`MinMaxScaler` transforms a dataset of `Vector` rows, rescaling each feature to a specific range (often [0, 1]). It takes parameters: + +* `min`: 0.0 by default. Lower bound after transformation, shared by all features. +* `max`: 1.0 by default. Upper bound after transformation, shared by all features. + +`MinMaxScaler` computes summary statistics on a data set and produces a `MinMaxScalerModel`. The model can then transform each feature individually such that it is in the given range. + +The rescaled value for a feature E is calculated as, +`\begin{equation} + Rescaled(e_i) = \frac{e_i - E_{min}}{E_{max} - E_{min}} * (max - min) + min +\end{equation}` +For the case `E_{max} == E_{min}`, `Rescaled(e_i) = 0.5 * (max + min)` + +Note that since zero values will probably be transformed to non-zero values, output of the transformer will be DenseVector even for sparse input. + +The following example demonstrates how to load a dataset in libsvm format and then rescale each feature to [0, 1]. + +
+
+More details can be found in the API docs for +[MinMaxScaler](api/scala/index.html#org.apache.spark.ml.feature.MinMaxScaler) and +[MinMaxScalerModel](api/scala/index.html#org.apache.spark.ml.feature.MinMaxScalerModel). +{% highlight scala %} +import org.apache.spark.ml.feature.MinMaxScaler +import org.apache.spark.mllib.util.MLUtils + +val data = MLUtils.loadLibSVMFile(sc, "data/mllib/sample_libsvm_data.txt") +val dataFrame = sqlContext.createDataFrame(data) +val scaler = new MinMaxScaler() + .setInputCol("features") + .setOutputCol("scaledFeatures") + +// Compute summary statistics and generate MinMaxScalerModel +val scalerModel = scaler.fit(dataFrame) + +// rescale each feature to range [min, max]. +val scaledData = scalerModel.transform(dataFrame) +{% endhighlight %} +
+ +
+More details can be found in the API docs for +[MinMaxScaler](api/java/org/apache/spark/ml/feature/MinMaxScaler.html) and +[MinMaxScalerModel](api/java/org/apache/spark/ml/feature/MinMaxScalerModel.html). +{% highlight java %} +import org.apache.spark.api.java.JavaRDD; +import org.apache.spark.ml.feature.MinMaxScaler; +import org.apache.spark.ml.feature.MinMaxScalerModel; +import org.apache.spark.mllib.regression.LabeledPoint; +import org.apache.spark.mllib.util.MLUtils; +import org.apache.spark.sql.DataFrame; + +JavaRDD data = + MLUtils.loadLibSVMFile(jsc.sc(), "data/mllib/sample_libsvm_data.txt").toJavaRDD(); +DataFrame dataFrame = jsql.createDataFrame(data, LabeledPoint.class); +MinMaxScaler scaler = new MinMaxScaler() + .setInputCol("features") + .setOutputCol("scaledFeatures"); + +// Compute summary statistics and generate MinMaxScalerModel +MinMaxScalerModel scalerModel = scaler.fit(dataFrame); + +// rescale each feature to range [min, max]. +DataFrame scaledData = scalerModel.transform(dataFrame); +{% endhighlight %} +
+
+ ## Bucketizer `Bucketizer` transforms a column of continuous features to a column of feature buckets, where the buckets are specified by users. It takes a parameter: From 881208a8e849facf54166bdd69d3634407f952e7 Mon Sep 17 00:00:00 2001 From: Feynman Liang Date: Tue, 25 Aug 2015 11:58:47 -0700 Subject: [PATCH 074/802] [SPARK-10230] [MLLIB] Rename optimizeAlpha to optimizeDocConcentration See [discussion](https://github.com/apache/spark/pull/8254#discussion_r37837770) CC jkbradley Author: Feynman Liang Closes #8422 from feynmanliang/SPARK-10230. --- .../spark/mllib/clustering/LDAOptimizer.scala | 16 ++++++++-------- .../apache/spark/mllib/clustering/LDASuite.scala | 2 +- 2 files changed, 9 insertions(+), 9 deletions(-) diff --git a/mllib/src/main/scala/org/apache/spark/mllib/clustering/LDAOptimizer.scala b/mllib/src/main/scala/org/apache/spark/mllib/clustering/LDAOptimizer.scala index 5c2aae6403bea..38486e949bbcf 100644 --- a/mllib/src/main/scala/org/apache/spark/mllib/clustering/LDAOptimizer.scala +++ b/mllib/src/main/scala/org/apache/spark/mllib/clustering/LDAOptimizer.scala @@ -258,7 +258,7 @@ final class OnlineLDAOptimizer extends LDAOptimizer { private var tau0: Double = 1024 private var kappa: Double = 0.51 private var miniBatchFraction: Double = 0.05 - private var optimizeAlpha: Boolean = false + private var optimizeDocConcentration: Boolean = false // internal data structure private var docs: RDD[(Long, Vector)] = null @@ -335,20 +335,20 @@ final class OnlineLDAOptimizer extends LDAOptimizer { } /** - * Optimize alpha, indicates whether alpha (Dirichlet parameter for document-topic distribution) - * will be optimized during training. + * Optimize docConcentration, indicates whether docConcentration (Dirichlet parameter for + * document-topic distribution) will be optimized during training. */ @Since("1.5.0") - def getOptimzeAlpha: Boolean = this.optimizeAlpha + def getOptimizeDocConcentration: Boolean = this.optimizeDocConcentration /** - * Sets whether to optimize alpha parameter during training. + * Sets whether to optimize docConcentration parameter during training. 
* * Default: false */ @Since("1.5.0") - def setOptimzeAlpha(optimizeAlpha: Boolean): this.type = { - this.optimizeAlpha = optimizeAlpha + def setOptimizeDocConcentration(optimizeDocConcentration: Boolean): this.type = { + this.optimizeDocConcentration = optimizeDocConcentration this } @@ -458,7 +458,7 @@ final class OnlineLDAOptimizer extends LDAOptimizer { // Note that this is an optimization to avoid batch.count updateLambda(batchResult, (miniBatchFraction * corpusSize).ceil.toInt) - if (optimizeAlpha) updateAlpha(gammat) + if (optimizeDocConcentration) updateAlpha(gammat) this } diff --git a/mllib/src/test/scala/org/apache/spark/mllib/clustering/LDASuite.scala b/mllib/src/test/scala/org/apache/spark/mllib/clustering/LDASuite.scala index 8a714f9b79e02..746a76a7e5fa1 100644 --- a/mllib/src/test/scala/org/apache/spark/mllib/clustering/LDASuite.scala +++ b/mllib/src/test/scala/org/apache/spark/mllib/clustering/LDASuite.scala @@ -423,7 +423,7 @@ class LDASuite extends SparkFunSuite with MLlibTestSparkContext { val k = 2 val docs = sc.parallelize(toyData) val op = new OnlineLDAOptimizer().setMiniBatchFraction(1).setTau0(1024).setKappa(0.51) - .setGammaShape(100).setOptimzeAlpha(true).setSampleWithReplacement(false) + .setGammaShape(100).setOptimizeDocConcentration(true).setSampleWithReplacement(false) val lda = new LDA().setK(k) .setDocConcentration(1D / k) .setTopicConcentration(0.01) From 16a2be1a84c0a274a60c0a584faaf58b55d4942b Mon Sep 17 00:00:00 2001 From: Xiangrui Meng Date: Tue, 25 Aug 2015 12:16:23 -0700 Subject: [PATCH 075/802] [SPARK-10231] [MLLIB] update @Since annotation for mllib.classification Update `Since` annotation in `mllib.classification`: 1. add version to classes, objects, constructors, and public variables declared in constructors 2. correct some versions 3. remove `Since` on `toString` MechCoder dbtsai Author: Xiangrui Meng Closes #8421 from mengxr/SPARK-10231 and squashes the following commits: b2dce80 [Xiangrui Meng] update @Since annotation for mllib.classification --- .../classification/ClassificationModel.scala | 7 +++-- .../classification/LogisticRegression.scala | 20 +++++++++---- .../mllib/classification/NaiveBayes.scala | 28 +++++++++++++++---- .../spark/mllib/classification/SVM.scala | 15 ++++++---- .../StreamingLogisticRegressionWithSGD.scala | 9 +++++- 5 files changed, 58 insertions(+), 21 deletions(-) diff --git a/mllib/src/main/scala/org/apache/spark/mllib/classification/ClassificationModel.scala b/mllib/src/main/scala/org/apache/spark/mllib/classification/ClassificationModel.scala index a29b425a71fd6..85a413243b049 100644 --- a/mllib/src/main/scala/org/apache/spark/mllib/classification/ClassificationModel.scala +++ b/mllib/src/main/scala/org/apache/spark/mllib/classification/ClassificationModel.scala @@ -30,6 +30,7 @@ import org.apache.spark.rdd.RDD * belongs. The categories are represented by double values: 0.0, 1.0, 2.0, etc. */ @Experimental +@Since("0.8.0") trait ClassificationModel extends Serializable { /** * Predict values for the given data set using the model trained. 
@@ -37,7 +38,7 @@ trait ClassificationModel extends Serializable { * @param testData RDD representing data points to be predicted * @return an RDD[Double] where each entry contains the corresponding prediction */ - @Since("0.8.0") + @Since("1.0.0") def predict(testData: RDD[Vector]): RDD[Double] /** @@ -46,7 +47,7 @@ trait ClassificationModel extends Serializable { * @param testData array representing a single data point * @return predicted category from the trained model */ - @Since("0.8.0") + @Since("1.0.0") def predict(testData: Vector): Double /** @@ -54,7 +55,7 @@ trait ClassificationModel extends Serializable { * @param testData JavaRDD representing data points to be predicted * @return a JavaRDD[java.lang.Double] where each entry contains the corresponding prediction */ - @Since("0.8.0") + @Since("1.0.0") def predict(testData: JavaRDD[Vector]): JavaRDD[java.lang.Double] = predict(testData.rdd).toJavaRDD().asInstanceOf[JavaRDD[java.lang.Double]] } diff --git a/mllib/src/main/scala/org/apache/spark/mllib/classification/LogisticRegression.scala b/mllib/src/main/scala/org/apache/spark/mllib/classification/LogisticRegression.scala index e03e662227d14..5ceff5b2259ea 100644 --- a/mllib/src/main/scala/org/apache/spark/mllib/classification/LogisticRegression.scala +++ b/mllib/src/main/scala/org/apache/spark/mllib/classification/LogisticRegression.scala @@ -41,11 +41,12 @@ import org.apache.spark.rdd.RDD * Multinomial Logistic Regression. By default, it is binary logistic regression * so numClasses will be set to 2. */ -class LogisticRegressionModel ( - override val weights: Vector, - override val intercept: Double, - val numFeatures: Int, - val numClasses: Int) +@Since("0.8.0") +class LogisticRegressionModel @Since("1.3.0") ( + @Since("1.0.0") override val weights: Vector, + @Since("1.0.0") override val intercept: Double, + @Since("1.3.0") val numFeatures: Int, + @Since("1.3.0") val numClasses: Int) extends GeneralizedLinearModel(weights, intercept) with ClassificationModel with Serializable with Saveable with PMMLExportable { @@ -75,6 +76,7 @@ class LogisticRegressionModel ( /** * Constructs a [[LogisticRegressionModel]] with weights and intercept for binary classification. */ + @Since("1.0.0") def this(weights: Vector, intercept: Double) = this(weights, intercept, weights.size, 2) private var threshold: Option[Double] = Some(0.5) @@ -166,12 +168,12 @@ class LogisticRegressionModel ( override protected def formatVersion: String = "1.0" - @Since("1.4.0") override def toString: String = { s"${super.toString}, numClasses = ${numClasses}, threshold = ${threshold.getOrElse("None")}" } } +@Since("1.3.0") object LogisticRegressionModel extends Loader[LogisticRegressionModel] { @Since("1.3.0") @@ -207,6 +209,7 @@ object LogisticRegressionModel extends Loader[LogisticRegressionModel] { * for k classes multi-label classification problem. * Using [[LogisticRegressionWithLBFGS]] is recommended over this. 
*/ +@Since("0.8.0") class LogisticRegressionWithSGD private[mllib] ( private var stepSize: Double, private var numIterations: Int, @@ -216,6 +219,7 @@ class LogisticRegressionWithSGD private[mllib] ( private val gradient = new LogisticGradient() private val updater = new SquaredL2Updater() + @Since("0.8.0") override val optimizer = new GradientDescent(gradient, updater) .setStepSize(stepSize) .setNumIterations(numIterations) @@ -227,6 +231,7 @@ class LogisticRegressionWithSGD private[mllib] ( * Construct a LogisticRegression object with default parameters: {stepSize: 1.0, * numIterations: 100, regParm: 0.01, miniBatchFraction: 1.0}. */ + @Since("0.8.0") def this() = this(1.0, 100, 0.01, 1.0) override protected[mllib] def createModel(weights: Vector, intercept: Double) = { @@ -238,6 +243,7 @@ class LogisticRegressionWithSGD private[mllib] ( * Top-level methods for calling Logistic Regression using Stochastic Gradient Descent. * NOTE: Labels used in Logistic Regression should be {0, 1} */ +@Since("0.8.0") object LogisticRegressionWithSGD { // NOTE(shivaram): We use multiple train methods instead of default arguments to support // Java programs. @@ -333,11 +339,13 @@ object LogisticRegressionWithSGD { * NOTE: Labels used in Logistic Regression should be {0, 1, ..., k - 1} * for k classes multi-label classification problem. */ +@Since("1.1.0") class LogisticRegressionWithLBFGS extends GeneralizedLinearAlgorithm[LogisticRegressionModel] with Serializable { this.setFeatureScaling(true) + @Since("1.1.0") override val optimizer = new LBFGS(new LogisticGradient, new SquaredL2Updater) override protected val validators = List(multiLabelValidator) diff --git a/mllib/src/main/scala/org/apache/spark/mllib/classification/NaiveBayes.scala b/mllib/src/main/scala/org/apache/spark/mllib/classification/NaiveBayes.scala index dab369207cc9a..a956084ae06e8 100644 --- a/mllib/src/main/scala/org/apache/spark/mllib/classification/NaiveBayes.scala +++ b/mllib/src/main/scala/org/apache/spark/mllib/classification/NaiveBayes.scala @@ -41,11 +41,12 @@ import org.apache.spark.sql.{DataFrame, SQLContext} * where D is number of features * @param modelType The type of NB model to fit can be "multinomial" or "bernoulli" */ +@Since("0.9.0") class NaiveBayesModel private[spark] ( - val labels: Array[Double], - val pi: Array[Double], - val theta: Array[Array[Double]], - val modelType: String) + @Since("1.0.0") val labels: Array[Double], + @Since("0.9.0") val pi: Array[Double], + @Since("0.9.0") val theta: Array[Array[Double]], + @Since("1.4.0") val modelType: String) extends ClassificationModel with Serializable with Saveable { import NaiveBayes.{Bernoulli, Multinomial, supportedModelTypes} @@ -83,6 +84,7 @@ class NaiveBayesModel private[spark] ( throw new UnknownError(s"Invalid modelType: $modelType.") } + @Since("1.0.0") override def predict(testData: RDD[Vector]): RDD[Double] = { val bcModel = testData.context.broadcast(this) testData.mapPartitions { iter => @@ -91,6 +93,7 @@ class NaiveBayesModel private[spark] ( } } + @Since("1.0.0") override def predict(testData: Vector): Double = { modelType match { case Multinomial => @@ -107,6 +110,7 @@ class NaiveBayesModel private[spark] ( * @return an RDD[Vector] where each entry contains the predicted posterior class probabilities, * in the same order as class labels */ + @Since("1.5.0") def predictProbabilities(testData: RDD[Vector]): RDD[Vector] = { val bcModel = testData.context.broadcast(this) testData.mapPartitions { iter => @@ -122,6 +126,7 @@ class NaiveBayesModel 
private[spark] ( * @return predicted posterior class probabilities from the trained model, * in the same order as class labels */ + @Since("1.5.0") def predictProbabilities(testData: Vector): Vector = { modelType match { case Multinomial => @@ -158,6 +163,7 @@ class NaiveBayesModel private[spark] ( new DenseVector(scaledProbs.map(_ / probSum)) } + @Since("1.3.0") override def save(sc: SparkContext, path: String): Unit = { val data = NaiveBayesModel.SaveLoadV2_0.Data(labels, pi, theta, modelType) NaiveBayesModel.SaveLoadV2_0.save(sc, path, data) @@ -166,6 +172,7 @@ class NaiveBayesModel private[spark] ( override protected def formatVersion: String = "2.0" } +@Since("1.3.0") object NaiveBayesModel extends Loader[NaiveBayesModel] { import org.apache.spark.mllib.util.Loader._ @@ -199,6 +206,7 @@ object NaiveBayesModel extends Loader[NaiveBayesModel] { dataRDD.write.parquet(dataPath(path)) } + @Since("1.3.0") def load(sc: SparkContext, path: String): NaiveBayesModel = { val sqlContext = new SQLContext(sc) // Load Parquet data. @@ -301,30 +309,35 @@ object NaiveBayesModel extends Loader[NaiveBayesModel] { * document classification. By making every vector a 0-1 vector, it can also be used as * Bernoulli NB ([[http://tinyurl.com/p7c96j6]]). The input feature values must be nonnegative. */ - +@Since("0.9.0") class NaiveBayes private ( private var lambda: Double, private var modelType: String) extends Serializable with Logging { import NaiveBayes.{Bernoulli, Multinomial} + @Since("1.4.0") def this(lambda: Double) = this(lambda, NaiveBayes.Multinomial) + @Since("0.9.0") def this() = this(1.0, NaiveBayes.Multinomial) /** Set the smoothing parameter. Default: 1.0. */ + @Since("0.9.0") def setLambda(lambda: Double): NaiveBayes = { this.lambda = lambda this } /** Get the smoothing parameter. */ + @Since("1.4.0") def getLambda: Double = lambda /** * Set the model type using a string (case-sensitive). * Supported options: "multinomial" (default) and "bernoulli". */ + @Since("1.4.0") def setModelType(modelType: String): NaiveBayes = { require(NaiveBayes.supportedModelTypes.contains(modelType), s"NaiveBayes was created with an unknown modelType: $modelType.") @@ -333,6 +346,7 @@ class NaiveBayes private ( } /** Get the model type. */ + @Since("1.4.0") def getModelType: String = this.modelType /** @@ -340,6 +354,7 @@ class NaiveBayes private ( * * @param data RDD of [[org.apache.spark.mllib.regression.LabeledPoint]]. */ + @Since("0.9.0") def run(data: RDD[LabeledPoint]): NaiveBayesModel = { val requireNonnegativeValues: Vector => Unit = (v: Vector) => { val values = v match { @@ -423,6 +438,7 @@ class NaiveBayes private ( /** * Top-level methods for calling naive Bayes. */ +@Since("0.9.0") object NaiveBayes { /** String name for multinomial model type. 
*/ @@ -485,7 +501,7 @@ object NaiveBayes { * @param modelType The type of NB model to fit from the enumeration NaiveBayesModels, can be * multinomial or bernoulli */ - @Since("0.9.0") + @Since("1.4.0") def train(input: RDD[LabeledPoint], lambda: Double, modelType: String): NaiveBayesModel = { require(supportedModelTypes.contains(modelType), s"NaiveBayes was created with an unknown modelType: $modelType.") diff --git a/mllib/src/main/scala/org/apache/spark/mllib/classification/SVM.scala b/mllib/src/main/scala/org/apache/spark/mllib/classification/SVM.scala index 5f87269863572..896565cd90e89 100644 --- a/mllib/src/main/scala/org/apache/spark/mllib/classification/SVM.scala +++ b/mllib/src/main/scala/org/apache/spark/mllib/classification/SVM.scala @@ -33,9 +33,10 @@ import org.apache.spark.rdd.RDD * @param weights Weights computed for every feature. * @param intercept Intercept computed for this model. */ -class SVMModel ( - override val weights: Vector, - override val intercept: Double) +@Since("0.8.0") +class SVMModel @Since("1.1.0") ( + @Since("1.0.0") override val weights: Vector, + @Since("0.8.0") override val intercept: Double) extends GeneralizedLinearModel(weights, intercept) with ClassificationModel with Serializable with Saveable with PMMLExportable { @@ -47,7 +48,7 @@ class SVMModel ( * with prediction score greater than or equal to this threshold is identified as an positive, * and negative otherwise. The default value is 0.0. */ - @Since("1.3.0") + @Since("1.0.0") @Experimental def setThreshold(threshold: Double): this.type = { this.threshold = Some(threshold) @@ -92,12 +93,12 @@ class SVMModel ( override protected def formatVersion: String = "1.0" - @Since("1.4.0") override def toString: String = { s"${super.toString}, numClasses = 2, threshold = ${threshold.getOrElse("None")}" } } +@Since("1.3.0") object SVMModel extends Loader[SVMModel] { @Since("1.3.0") @@ -132,6 +133,7 @@ object SVMModel extends Loader[SVMModel] { * regularization is used, which can be changed via [[SVMWithSGD.optimizer]]. * NOTE: Labels used in SVM should be {0, 1}. */ +@Since("0.8.0") class SVMWithSGD private ( private var stepSize: Double, private var numIterations: Int, @@ -141,6 +143,7 @@ class SVMWithSGD private ( private val gradient = new HingeGradient() private val updater = new SquaredL2Updater() + @Since("0.8.0") override val optimizer = new GradientDescent(gradient, updater) .setStepSize(stepSize) .setNumIterations(numIterations) @@ -152,6 +155,7 @@ class SVMWithSGD private ( * Construct a SVM object with default parameters: {stepSize: 1.0, numIterations: 100, * regParm: 0.01, miniBatchFraction: 1.0}. */ + @Since("0.8.0") def this() = this(1.0, 100, 0.01, 1.0) override protected def createModel(weights: Vector, intercept: Double) = { @@ -162,6 +166,7 @@ class SVMWithSGD private ( /** * Top-level methods for calling SVM. NOTE: Labels used in SVM should be {0, 1}. 
*/ +@Since("0.8.0") object SVMWithSGD { /** diff --git a/mllib/src/main/scala/org/apache/spark/mllib/classification/StreamingLogisticRegressionWithSGD.scala b/mllib/src/main/scala/org/apache/spark/mllib/classification/StreamingLogisticRegressionWithSGD.scala index 7d33df3221fbf..75630054d1368 100644 --- a/mllib/src/main/scala/org/apache/spark/mllib/classification/StreamingLogisticRegressionWithSGD.scala +++ b/mllib/src/main/scala/org/apache/spark/mllib/classification/StreamingLogisticRegressionWithSGD.scala @@ -17,7 +17,7 @@ package org.apache.spark.mllib.classification -import org.apache.spark.annotation.Experimental +import org.apache.spark.annotation.{Experimental, Since} import org.apache.spark.mllib.linalg.Vector import org.apache.spark.mllib.regression.StreamingLinearAlgorithm @@ -44,6 +44,7 @@ import org.apache.spark.mllib.regression.StreamingLinearAlgorithm * }}} */ @Experimental +@Since("1.3.0") class StreamingLogisticRegressionWithSGD private[mllib] ( private var stepSize: Double, private var numIterations: Int, @@ -58,6 +59,7 @@ class StreamingLogisticRegressionWithSGD private[mllib] ( * Initial weights must be set before using trainOn or predictOn * (see `StreamingLinearAlgorithm`) */ + @Since("1.3.0") def this() = this(0.1, 50, 1.0, 0.0) protected val algorithm = new LogisticRegressionWithSGD( @@ -66,30 +68,35 @@ class StreamingLogisticRegressionWithSGD private[mllib] ( protected var model: Option[LogisticRegressionModel] = None /** Set the step size for gradient descent. Default: 0.1. */ + @Since("1.3.0") def setStepSize(stepSize: Double): this.type = { this.algorithm.optimizer.setStepSize(stepSize) this } /** Set the number of iterations of gradient descent to run per update. Default: 50. */ + @Since("1.3.0") def setNumIterations(numIterations: Int): this.type = { this.algorithm.optimizer.setNumIterations(numIterations) this } /** Set the fraction of each batch to use for updates. Default: 1.0. */ + @Since("1.3.0") def setMiniBatchFraction(miniBatchFraction: Double): this.type = { this.algorithm.optimizer.setMiniBatchFraction(miniBatchFraction) this } /** Set the regularization parameter. Default: 0.0. */ + @Since("1.3.0") def setRegParam(regParam: Double): this.type = { this.algorithm.optimizer.setRegParam(regParam) this } /** Set the initial weights. Default: [0.0, 0.0]. */ + @Since("1.3.0") def setInitialWeights(initialWeights: Vector): this.type = { this.model = Some(algorithm.createModel(initialWeights, 0.0)) this From 71a138cd0e0a14e8426f97877e3b52a562bbd02c Mon Sep 17 00:00:00 2001 From: Sun Rui Date: Tue, 25 Aug 2015 13:14:10 -0700 Subject: [PATCH 076/802] [SPARK-10048] [SPARKR] Support arbitrary nested Java array in serde. This PR: 1. supports transferring arbitrary nested array from JVM to R side in SerDe; 2. based on 1, collect() implemenation is improved. Now it can support collecting data of complex types from a DataFrame. Author: Sun Rui Closes #8276 from sun-rui/SPARK-10048. 
--- R/pkg/R/DataFrame.R | 55 +++++++++--- R/pkg/R/deserialize.R | 72 +++++++--------- R/pkg/R/serialize.R | 10 +-- R/pkg/inst/tests/test_Serde.R | 77 +++++++++++++++++ R/pkg/inst/worker/worker.R | 4 +- .../apache/spark/api/r/RBackendHandler.scala | 7 ++ .../scala/org/apache/spark/api/r/SerDe.scala | 86 +++++++++++-------- .../org/apache/spark/sql/api/r/SQLUtils.scala | 32 +------ 8 files changed, 216 insertions(+), 127 deletions(-) create mode 100644 R/pkg/inst/tests/test_Serde.R diff --git a/R/pkg/R/DataFrame.R b/R/pkg/R/DataFrame.R index 10f3c4ea59864..ae1d912cf6da1 100644 --- a/R/pkg/R/DataFrame.R +++ b/R/pkg/R/DataFrame.R @@ -652,18 +652,49 @@ setMethod("dim", setMethod("collect", signature(x = "DataFrame"), function(x, stringsAsFactors = FALSE) { - # listCols is a list of raw vectors, one per column - listCols <- callJStatic("org.apache.spark.sql.api.r.SQLUtils", "dfToCols", x@sdf) - cols <- lapply(listCols, function(col) { - objRaw <- rawConnection(col) - numRows <- readInt(objRaw) - col <- readCol(objRaw, numRows) - close(objRaw) - col - }) - names(cols) <- columns(x) - do.call(cbind.data.frame, list(cols, stringsAsFactors = stringsAsFactors)) - }) + names <- columns(x) + ncol <- length(names) + if (ncol <= 0) { + # empty data.frame with 0 columns and 0 rows + data.frame() + } else { + # listCols is a list of columns + listCols <- callJStatic("org.apache.spark.sql.api.r.SQLUtils", "dfToCols", x@sdf) + stopifnot(length(listCols) == ncol) + + # An empty data.frame with 0 columns and number of rows as collected + nrow <- length(listCols[[1]]) + if (nrow <= 0) { + df <- data.frame() + } else { + df <- data.frame(row.names = 1 : nrow) + } + + # Append columns one by one + for (colIndex in 1 : ncol) { + # Note: appending a column of list type into a data.frame so that + # data of complex type can be held. But getting a cell from a column + # of list type returns a list instead of a vector. So for columns of + # non-complex type, append them as vector. + col <- listCols[[colIndex]] + if (length(col) <= 0) { + df[[names[colIndex]]] <- col + } else { + # TODO: more robust check on column of primitive types + vec <- do.call(c, col) + if (class(vec) != "list") { + df[[names[colIndex]]] <- vec + } else { + # For columns of complex type, be careful to access them. + # Get a column of complex type returns a list. + # Get a cell from a column of complex type returns a list instead of a vector. + df[[names[colIndex]]] <- col + } + } + } + df + } + }) #' Limit #' diff --git a/R/pkg/R/deserialize.R b/R/pkg/R/deserialize.R index 33bf13ec9e784..6cf628e3007de 100644 --- a/R/pkg/R/deserialize.R +++ b/R/pkg/R/deserialize.R @@ -48,6 +48,7 @@ readTypedObject <- function(con, type) { "r" = readRaw(con), "D" = readDate(con), "t" = readTime(con), + "a" = readArray(con), "l" = readList(con), "n" = NULL, "j" = getJobj(readString(con)), @@ -85,8 +86,7 @@ readTime <- function(con) { as.POSIXct(t, origin = "1970-01-01") } -# We only support lists where all elements are of same type -readList <- function(con) { +readArray <- function(con) { type <- readType(con) len <- readInt(con) if (len > 0) { @@ -100,6 +100,25 @@ readList <- function(con) { } } +# Read a list. Types of each element may be different. +# Null objects are read as NA. 
+readList <- function(con) { + len <- readInt(con) + if (len > 0) { + l <- vector("list", len) + for (i in 1:len) { + elem <- readObject(con) + if (is.null(elem)) { + elem <- NA + } + l[[i]] <- elem + } + l + } else { + list() + } +} + readRaw <- function(con) { dataLen <- readInt(con) readBin(con, raw(), as.integer(dataLen), endian = "big") @@ -132,18 +151,19 @@ readDeserialize <- function(con) { } } -readDeserializeRows <- function(inputCon) { - # readDeserializeRows will deserialize a DataOutputStream composed of - # a list of lists. Since the DOS is one continuous stream and - # the number of rows varies, we put the readRow function in a while loop - # that termintates when the next row is empty. +readMultipleObjects <- function(inputCon) { + # readMultipleObjects will read multiple continuous objects from + # a DataOutputStream. There is no preceding field telling the count + # of the objects, so the number of objects varies, we try to read + # all objects in a loop until the end of the stream. data <- list() while(TRUE) { - row <- readRow(inputCon) - if (length(row) == 0) { + # If reaching the end of the stream, type returned should be "". + type <- readType(inputCon) + if (type == "") { break } - data[[length(data) + 1L]] <- row + data[[length(data) + 1L]] <- readTypedObject(inputCon, type) } data # this is a list of named lists now } @@ -155,35 +175,5 @@ readRowList <- function(obj) { # deserialize the row. rawObj <- rawConnection(obj, "r+") on.exit(close(rawObj)) - readRow(rawObj) -} - -readRow <- function(inputCon) { - numCols <- readInt(inputCon) - if (length(numCols) > 0 && numCols > 0) { - lapply(1:numCols, function(x) { - obj <- readObject(inputCon) - if (is.null(obj)) { - NA - } else { - obj - } - }) # each row is a list now - } else { - list() - } -} - -# Take a single column as Array[Byte] and deserialize it into an atomic vector -readCol <- function(inputCon, numRows) { - if (numRows > 0) { - # sapply can not work with POSIXlt - do.call(c, lapply(1:numRows, function(x) { - value <- readObject(inputCon) - # Replace NULL with NA so we can coerce to vectors - if (is.null(value)) NA else value - })) - } else { - vector() - } + readObject(rawObj) } diff --git a/R/pkg/R/serialize.R b/R/pkg/R/serialize.R index 311021e5d8473..e3676f57f907f 100644 --- a/R/pkg/R/serialize.R +++ b/R/pkg/R/serialize.R @@ -110,18 +110,10 @@ writeRowSerialize <- function(outputCon, rows) { serializeRow <- function(row) { rawObj <- rawConnection(raw(0), "wb") on.exit(close(rawObj)) - writeRow(rawObj, row) + writeGenericList(rawObj, row) rawConnectionValue(rawObj) } -writeRow <- function(con, row) { - numCols <- length(row) - writeInt(con, numCols) - for (i in 1:numCols) { - writeObject(con, row[[i]]) - } -} - writeRaw <- function(con, batch) { writeInt(con, length(batch)) writeBin(batch, con, endian = "big") diff --git a/R/pkg/inst/tests/test_Serde.R b/R/pkg/inst/tests/test_Serde.R new file mode 100644 index 0000000000000..009db85da2beb --- /dev/null +++ b/R/pkg/inst/tests/test_Serde.R @@ -0,0 +1,77 @@ +# +# Licensed to the Apache Software Foundation (ASF) under one or more +# contributor license agreements. See the NOTICE file distributed with +# this work for additional information regarding copyright ownership. +# The ASF licenses this file to You under the Apache License, Version 2.0 +# (the "License"); you may not use this file except in compliance with +# the License. 
You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# + +context("SerDe functionality") + +sc <- sparkR.init() + +test_that("SerDe of primitive types", { + x <- callJStatic("SparkRHandler", "echo", 1L) + expect_equal(x, 1L) + expect_equal(class(x), "integer") + + x <- callJStatic("SparkRHandler", "echo", 1) + expect_equal(x, 1) + expect_equal(class(x), "numeric") + + x <- callJStatic("SparkRHandler", "echo", TRUE) + expect_true(x) + expect_equal(class(x), "logical") + + x <- callJStatic("SparkRHandler", "echo", "abc") + expect_equal(x, "abc") + expect_equal(class(x), "character") +}) + +test_that("SerDe of list of primitive types", { + x <- list(1L, 2L, 3L) + y <- callJStatic("SparkRHandler", "echo", x) + expect_equal(x, y) + expect_equal(class(y[[1]]), "integer") + + x <- list(1, 2, 3) + y <- callJStatic("SparkRHandler", "echo", x) + expect_equal(x, y) + expect_equal(class(y[[1]]), "numeric") + + x <- list(TRUE, FALSE) + y <- callJStatic("SparkRHandler", "echo", x) + expect_equal(x, y) + expect_equal(class(y[[1]]), "logical") + + x <- list("a", "b", "c") + y <- callJStatic("SparkRHandler", "echo", x) + expect_equal(x, y) + expect_equal(class(y[[1]]), "character") + + # Empty list + x <- list() + y <- callJStatic("SparkRHandler", "echo", x) + expect_equal(x, y) +}) + +test_that("SerDe of list of lists", { + x <- list(list(1L, 2L, 3L), list(1, 2, 3), + list(TRUE, FALSE), list("a", "b", "c")) + y <- callJStatic("SparkRHandler", "echo", x) + expect_equal(x, y) + + # List of empty lists + x <- list(list(), list()) + y <- callJStatic("SparkRHandler", "echo", x) + expect_equal(x, y) +}) diff --git a/R/pkg/inst/worker/worker.R b/R/pkg/inst/worker/worker.R index 7e3b5fc403b25..0c3b0d1f4be20 100644 --- a/R/pkg/inst/worker/worker.R +++ b/R/pkg/inst/worker/worker.R @@ -94,7 +94,7 @@ if (isEmpty != 0) { } else if (deserializer == "string") { data <- as.list(readLines(inputCon)) } else if (deserializer == "row") { - data <- SparkR:::readDeserializeRows(inputCon) + data <- SparkR:::readMultipleObjects(inputCon) } # Timing reading input data for execution inputElap <- elapsedSecs() @@ -120,7 +120,7 @@ if (isEmpty != 0) { } else if (deserializer == "string") { data <- readLines(inputCon) } else if (deserializer == "row") { - data <- SparkR:::readDeserializeRows(inputCon) + data <- SparkR:::readMultipleObjects(inputCon) } # Timing reading input data for execution inputElap <- elapsedSecs() diff --git a/core/src/main/scala/org/apache/spark/api/r/RBackendHandler.scala b/core/src/main/scala/org/apache/spark/api/r/RBackendHandler.scala index 6ce02e2ea336a..bb82f3285f1d9 100644 --- a/core/src/main/scala/org/apache/spark/api/r/RBackendHandler.scala +++ b/core/src/main/scala/org/apache/spark/api/r/RBackendHandler.scala @@ -53,6 +53,13 @@ private[r] class RBackendHandler(server: RBackend) if (objId == "SparkRHandler") { methodName match { + // This function is for test-purpose only + case "echo" => + val args = readArgs(numArgs, dis) + assert(numArgs == 1) + + writeInt(dos, 0) + writeObject(dos, args(0)) case "stopBackend" => writeInt(dos, 0) writeType(dos, "void") diff --git a/core/src/main/scala/org/apache/spark/api/r/SerDe.scala 
b/core/src/main/scala/org/apache/spark/api/r/SerDe.scala index dbbbcf40c1e96..26ad4f1d4697e 100644 --- a/core/src/main/scala/org/apache/spark/api/r/SerDe.scala +++ b/core/src/main/scala/org/apache/spark/api/r/SerDe.scala @@ -149,6 +149,10 @@ private[spark] object SerDe { case 'b' => readBooleanArr(dis) case 'j' => readStringArr(dis).map(x => JVMObjectTracker.getObject(x)) case 'r' => readBytesArr(dis) + case 'l' => { + val len = readInt(dis) + (0 until len).map(_ => readList(dis)).toArray + } case _ => throw new IllegalArgumentException(s"Invalid array type $arrType") } } @@ -200,6 +204,9 @@ private[spark] object SerDe { case "date" => dos.writeByte('D') case "time" => dos.writeByte('t') case "raw" => dos.writeByte('r') + // Array of primitive types + case "array" => dos.writeByte('a') + // Array of objects case "list" => dos.writeByte('l') case "jobj" => dos.writeByte('j') case _ => throw new IllegalArgumentException(s"Invalid type $typeStr") @@ -211,26 +218,35 @@ private[spark] object SerDe { writeType(dos, "void") } else { value.getClass.getName match { + case "java.lang.Character" => + writeType(dos, "character") + writeString(dos, value.asInstanceOf[Character].toString) case "java.lang.String" => writeType(dos, "character") writeString(dos, value.asInstanceOf[String]) - case "long" | "java.lang.Long" => + case "java.lang.Long" => writeType(dos, "double") writeDouble(dos, value.asInstanceOf[Long].toDouble) - case "float" | "java.lang.Float" => + case "java.lang.Float" => writeType(dos, "double") writeDouble(dos, value.asInstanceOf[Float].toDouble) - case "decimal" | "java.math.BigDecimal" => + case "java.math.BigDecimal" => writeType(dos, "double") val javaDecimal = value.asInstanceOf[java.math.BigDecimal] writeDouble(dos, scala.math.BigDecimal(javaDecimal).toDouble) - case "double" | "java.lang.Double" => + case "java.lang.Double" => writeType(dos, "double") writeDouble(dos, value.asInstanceOf[Double]) - case "int" | "java.lang.Integer" => + case "java.lang.Byte" => + writeType(dos, "integer") + writeInt(dos, value.asInstanceOf[Byte].toInt) + case "java.lang.Short" => + writeType(dos, "integer") + writeInt(dos, value.asInstanceOf[Short].toInt) + case "java.lang.Integer" => writeType(dos, "integer") writeInt(dos, value.asInstanceOf[Int]) - case "boolean" | "java.lang.Boolean" => + case "java.lang.Boolean" => writeType(dos, "logical") writeBoolean(dos, value.asInstanceOf[Boolean]) case "java.sql.Date" => @@ -242,43 +258,48 @@ private[spark] object SerDe { case "java.sql.Timestamp" => writeType(dos, "time") writeTime(dos, value.asInstanceOf[Timestamp]) + + // Handle arrays + + // Array of primitive types + + // Special handling for byte array case "[B" => writeType(dos, "raw") writeBytes(dos, value.asInstanceOf[Array[Byte]]) - // TODO: Types not handled right now include - // byte, char, short, float - // Handle arrays - case "[Ljava.lang.String;" => - writeType(dos, "list") - writeStringArr(dos, value.asInstanceOf[Array[String]]) + case "[C" => + writeType(dos, "array") + writeStringArr(dos, value.asInstanceOf[Array[Char]].map(_.toString)) + case "[S" => + writeType(dos, "array") + writeIntArr(dos, value.asInstanceOf[Array[Short]].map(_.toInt)) case "[I" => - writeType(dos, "list") + writeType(dos, "array") writeIntArr(dos, value.asInstanceOf[Array[Int]]) case "[J" => - writeType(dos, "list") + writeType(dos, "array") writeDoubleArr(dos, value.asInstanceOf[Array[Long]].map(_.toDouble)) + case "[F" => + writeType(dos, "array") + writeDoubleArr(dos, 
value.asInstanceOf[Array[Float]].map(_.toDouble)) case "[D" => - writeType(dos, "list") + writeType(dos, "array") writeDoubleArr(dos, value.asInstanceOf[Array[Double]]) case "[Z" => - writeType(dos, "list") + writeType(dos, "array") writeBooleanArr(dos, value.asInstanceOf[Array[Boolean]]) - case "[[B" => + + // Array of objects, null objects use "void" type + case c if c.startsWith("[") => writeType(dos, "list") - writeBytesArr(dos, value.asInstanceOf[Array[Array[Byte]]]) - case otherName => - // Handle array of objects - if (otherName.startsWith("[L")) { - val objArr = value.asInstanceOf[Array[Object]] - writeType(dos, "list") - writeType(dos, "jobj") - dos.writeInt(objArr.length) - objArr.foreach(o => writeJObj(dos, o)) - } else { - writeType(dos, "jobj") - writeJObj(dos, value) - } + val array = value.asInstanceOf[Array[Object]] + writeInt(dos, array.length) + array.foreach(elem => writeObject(dos, elem)) + + case _ => + writeType(dos, "jobj") + writeJObj(dos, value) } } } @@ -350,11 +371,6 @@ private[spark] object SerDe { value.foreach(v => writeString(out, v)) } - def writeBytesArr(out: DataOutputStream, value: Array[Array[Byte]]): Unit = { - writeType(out, "raw") - out.writeInt(value.length) - value.foreach(v => writeBytes(out, v)) - } } private[r] object SerializationFormats { diff --git a/sql/core/src/main/scala/org/apache/spark/sql/api/r/SQLUtils.scala b/sql/core/src/main/scala/org/apache/spark/sql/api/r/SQLUtils.scala index 92861ab038f19..7f3defec3d42e 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/api/r/SQLUtils.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/api/r/SQLUtils.scala @@ -98,27 +98,17 @@ private[r] object SQLUtils { val bos = new ByteArrayOutputStream() val dos = new DataOutputStream(bos) - SerDe.writeInt(dos, row.length) - (0 until row.length).map { idx => - val obj: Object = row(idx).asInstanceOf[Object] - SerDe.writeObject(dos, obj) - } + val cols = (0 until row.length).map(row(_).asInstanceOf[Object]).toArray + SerDe.writeObject(dos, cols) bos.toByteArray() } - def dfToCols(df: DataFrame): Array[Array[Byte]] = { + def dfToCols(df: DataFrame): Array[Array[Any]] = { // localDF is Array[Row] val localDF = df.collect() val numCols = df.columns.length - // dfCols is Array[Array[Any]] - val dfCols = convertRowsToColumns(localDF, numCols) - - dfCols.map { col => - colToRBytes(col) - } - } - def convertRowsToColumns(localDF: Array[Row], numCols: Int): Array[Array[Any]] = { + // result is Array[Array[Any]] (0 until numCols).map { colIdx => localDF.map { row => row(colIdx) @@ -126,20 +116,6 @@ private[r] object SQLUtils { }.toArray } - def colToRBytes(col: Array[Any]): Array[Byte] = { - val numRows = col.length - val bos = new ByteArrayOutputStream() - val dos = new DataOutputStream(bos) - - SerDe.writeInt(dos, numRows) - - col.map { item => - val obj: Object = item.asInstanceOf[Object] - SerDe.writeObject(dos, obj) - } - bos.toByteArray() - } - def saveMode(mode: String): SaveMode = { mode match { case "append" => SaveMode.Append From c0e9ff1588b4d9313cc6ec6e00e5c7663eb67910 Mon Sep 17 00:00:00 2001 From: Feynman Liang Date: Tue, 25 Aug 2015 13:21:05 -0700 Subject: [PATCH 077/802] [SPARK-9800] Adds docs for GradientDescent$.runMiniBatchSGD alias * Adds doc for alias of runMIniBatchSGD documenting default value for convergeTol * Cleans up a note in code Author: Feynman Liang Closes #8425 from feynmanliang/SPARK-9800. 
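For orientation, here is a minimal Scala sketch of how the documented alias relates to the full method. It is illustrative only: `sc` is assumed to be an existing SparkContext, the toy data and the LeastSquaresGradient/SimpleUpdater choices are not part of this patch, and the parameter order follows the 1.5 signature as an assumption.

    import org.apache.spark.mllib.linalg.Vectors
    import org.apache.spark.mllib.optimization.{GradientDescent, LeastSquaresGradient, SimpleUpdater}

    // Tiny synthetic (label, features) data set, purely for illustration.
    val data = sc.parallelize(Seq(
      (1.0, Vectors.dense(1.0, 0.0)),
      (0.0, Vectors.dense(0.0, 1.0))))
    val initialWeights = Vectors.dense(0.0, 0.0)

    // Full method: convergenceTol passed explicitly.
    val (w1, lossHistory1) = GradientDescent.runMiniBatchSGD(
      data, new LeastSquaresGradient(), new SimpleUpdater(),
      1.0,    // stepSize
      100,    // numIterations
      0.0,    // regParam
      1.0,    // miniBatchFraction
      initialWeights,
      0.001)  // convergenceTol

    // Alias documented by this patch: same call without the last argument;
    // convergenceTol falls back to the default of 0.001.
    val (w2, lossHistory2) = GradientDescent.runMiniBatchSGD(
      data, new LeastSquaresGradient(), new SimpleUpdater(),
      1.0, 100, 0.0, 1.0, initialWeights)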
--- .../apache/spark/mllib/optimization/GradientDescent.scala | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/mllib/src/main/scala/org/apache/spark/mllib/optimization/GradientDescent.scala b/mllib/src/main/scala/org/apache/spark/mllib/optimization/GradientDescent.scala index 8f0d1e4aa010a..3b663b5defb03 100644 --- a/mllib/src/main/scala/org/apache/spark/mllib/optimization/GradientDescent.scala +++ b/mllib/src/main/scala/org/apache/spark/mllib/optimization/GradientDescent.scala @@ -235,7 +235,7 @@ object GradientDescent extends Logging { if (miniBatchSize > 0) { /** - * NOTE(Xinghao): lossSum is computed using the weights from the previous iteration + * lossSum is computed using the weights from the previous iteration * and regVal is the regularization value computed in the previous iteration as well. */ stochasticLossHistory.append(lossSum / miniBatchSize + regVal) @@ -264,6 +264,9 @@ object GradientDescent extends Logging { } + /** + * Alias of [[runMiniBatchSGD]] with convergenceTol set to default value of 0.001. + */ def runMiniBatchSGD( data: RDD[(Double, Vector)], gradient: Gradient, From c619c7552f22d28cfa321ce671fc9ca854dd655f Mon Sep 17 00:00:00 2001 From: Xiangrui Meng Date: Tue, 25 Aug 2015 13:22:38 -0700 Subject: [PATCH 078/802] [SPARK-10237] [MLLIB] update since versions in mllib.fpm Same as #8421 but for `mllib.fpm`. cc feynmanliang Author: Xiangrui Meng Closes #8429 from mengxr/SPARK-10237. --- .../spark/mllib/fpm/AssociationRules.scala | 7 ++++-- .../org/apache/spark/mllib/fpm/FPGrowth.scala | 9 ++++++-- .../apache/spark/mllib/fpm/PrefixSpan.scala | 23 ++++++++++++++++--- 3 files changed, 32 insertions(+), 7 deletions(-) diff --git a/mllib/src/main/scala/org/apache/spark/mllib/fpm/AssociationRules.scala b/mllib/src/main/scala/org/apache/spark/mllib/fpm/AssociationRules.scala index ba3b447a83398..95c688c86a7e4 100644 --- a/mllib/src/main/scala/org/apache/spark/mllib/fpm/AssociationRules.scala +++ b/mllib/src/main/scala/org/apache/spark/mllib/fpm/AssociationRules.scala @@ -82,12 +82,15 @@ class AssociationRules private[fpm] ( }.filter(_.confidence >= minConfidence) } + /** Java-friendly version of [[run]]. */ + @Since("1.5.0") def run[Item](freqItemsets: JavaRDD[FreqItemset[Item]]): JavaRDD[Rule[Item]] = { val tag = fakeClassTag[Item] run(freqItemsets.rdd)(tag) } } +@Since("1.5.0") object AssociationRules { /** @@ -104,8 +107,8 @@ object AssociationRules { @Since("1.5.0") @Experimental class Rule[Item] private[fpm] ( - val antecedent: Array[Item], - val consequent: Array[Item], + @Since("1.5.0") val antecedent: Array[Item], + @Since("1.5.0") val consequent: Array[Item], freqUnion: Double, freqAntecedent: Double) extends Serializable { diff --git a/mllib/src/main/scala/org/apache/spark/mllib/fpm/FPGrowth.scala b/mllib/src/main/scala/org/apache/spark/mllib/fpm/FPGrowth.scala index e37f806271680..aea5c4f8a8a7d 100644 --- a/mllib/src/main/scala/org/apache/spark/mllib/fpm/FPGrowth.scala +++ b/mllib/src/main/scala/org/apache/spark/mllib/fpm/FPGrowth.scala @@ -42,7 +42,8 @@ import org.apache.spark.storage.StorageLevel */ @Since("1.3.0") @Experimental -class FPGrowthModel[Item: ClassTag](val freqItemsets: RDD[FreqItemset[Item]]) extends Serializable { +class FPGrowthModel[Item: ClassTag] @Since("1.3.0") ( + @Since("1.3.0") val freqItemsets: RDD[FreqItemset[Item]]) extends Serializable { /** * Generates association rules for the [[Item]]s in [[freqItemsets]]. 
* @param confidence minimal confidence of the rules produced @@ -126,6 +127,8 @@ class FPGrowth private ( new FPGrowthModel(freqItemsets) } + /** Java-friendly version of [[run]]. */ + @Since("1.3.0") def run[Item, Basket <: JavaIterable[Item]](data: JavaRDD[Basket]): FPGrowthModel[Item] = { implicit val tag = fakeClassTag[Item] run(data.rdd.map(_.asScala.toArray)) @@ -226,7 +229,9 @@ object FPGrowth { * */ @Since("1.3.0") - class FreqItemset[Item](val items: Array[Item], val freq: Long) extends Serializable { + class FreqItemset[Item] @Since("1.3.0") ( + @Since("1.3.0") val items: Array[Item], + @Since("1.3.0") val freq: Long) extends Serializable { /** * Returns items in a Java List. diff --git a/mllib/src/main/scala/org/apache/spark/mllib/fpm/PrefixSpan.scala b/mllib/src/main/scala/org/apache/spark/mllib/fpm/PrefixSpan.scala index dc4ae1d0b69ed..97916daa2e9ad 100644 --- a/mllib/src/main/scala/org/apache/spark/mllib/fpm/PrefixSpan.scala +++ b/mllib/src/main/scala/org/apache/spark/mllib/fpm/PrefixSpan.scala @@ -25,7 +25,7 @@ import scala.collection.JavaConverters._ import scala.reflect.ClassTag import org.apache.spark.Logging -import org.apache.spark.annotation.Experimental +import org.apache.spark.annotation.{Experimental, Since} import org.apache.spark.api.java.JavaRDD import org.apache.spark.api.java.JavaSparkContext.fakeClassTag import org.apache.spark.rdd.RDD @@ -51,6 +51,7 @@ import org.apache.spark.storage.StorageLevel * (Wikipedia)]] */ @Experimental +@Since("1.5.0") class PrefixSpan private ( private var minSupport: Double, private var maxPatternLength: Int, @@ -61,17 +62,20 @@ class PrefixSpan private ( * Constructs a default instance with default parameters * {minSupport: `0.1`, maxPatternLength: `10`, maxLocalProjDBSize: `32000000L`}. */ + @Since("1.5.0") def this() = this(0.1, 10, 32000000L) /** * Get the minimal support (i.e. the frequency of occurrence before a pattern is considered * frequent). */ + @Since("1.5.0") def getMinSupport: Double = minSupport /** * Sets the minimal support level (default: `0.1`). */ + @Since("1.5.0") def setMinSupport(minSupport: Double): this.type = { require(minSupport >= 0 && minSupport <= 1, s"The minimum support value must be in [0, 1], but got $minSupport.") @@ -82,11 +86,13 @@ class PrefixSpan private ( /** * Gets the maximal pattern length (i.e. the length of the longest sequential pattern to consider. */ + @Since("1.5.0") def getMaxPatternLength: Int = maxPatternLength /** * Sets maximal pattern length (default: `10`). */ + @Since("1.5.0") def setMaxPatternLength(maxPatternLength: Int): this.type = { // TODO: support unbounded pattern length when maxPatternLength = 0 require(maxPatternLength >= 1, @@ -98,12 +104,14 @@ class PrefixSpan private ( /** * Gets the maximum number of items allowed in a projected database before local processing. */ + @Since("1.5.0") def getMaxLocalProjDBSize: Long = maxLocalProjDBSize /** * Sets the maximum number of items (including delimiters used in the internal storage format) * allowed in a projected database before local processing (default: `32000000L`). */ + @Since("1.5.0") def setMaxLocalProjDBSize(maxLocalProjDBSize: Long): this.type = { require(maxLocalProjDBSize >= 0L, s"The maximum local projected database size must be nonnegative, but got $maxLocalProjDBSize") @@ -116,6 +124,7 @@ class PrefixSpan private ( * @param data sequences of itemsets. 
* @return a [[PrefixSpanModel]] that contains the frequent patterns */ + @Since("1.5.0") def run[Item: ClassTag](data: RDD[Array[Array[Item]]]): PrefixSpanModel[Item] = { if (data.getStorageLevel == StorageLevel.NONE) { logWarning("Input data is not cached.") @@ -202,6 +211,7 @@ class PrefixSpan private ( * @tparam Sequence sequence type, which is an Iterable of Itemsets * @return a [[PrefixSpanModel]] that contains the frequent sequential patterns */ + @Since("1.5.0") def run[Item, Itemset <: jl.Iterable[Item], Sequence <: jl.Iterable[Itemset]]( data: JavaRDD[Sequence]): PrefixSpanModel[Item] = { implicit val tag = fakeClassTag[Item] @@ -211,6 +221,7 @@ class PrefixSpan private ( } @Experimental +@Since("1.5.0") object PrefixSpan extends Logging { /** @@ -535,10 +546,14 @@ object PrefixSpan extends Logging { * @param freq frequency * @tparam Item item type */ - class FreqSequence[Item](val sequence: Array[Array[Item]], val freq: Long) extends Serializable { + @Since("1.5.0") + class FreqSequence[Item] @Since("1.5.0") ( + @Since("1.5.0") val sequence: Array[Array[Item]], + @Since("1.5.0") val freq: Long) extends Serializable { /** * Returns sequence as a Java List of lists for Java users. */ + @Since("1.5.0") def javaSequence: ju.List[ju.List[Item]] = sequence.map(_.toList.asJava).toList.asJava } } @@ -548,5 +563,7 @@ object PrefixSpan extends Logging { * @param freqSequences frequent sequences * @tparam Item item type */ -class PrefixSpanModel[Item](val freqSequences: RDD[PrefixSpan.FreqSequence[Item]]) +@Since("1.5.0") +class PrefixSpanModel[Item] @Since("1.5.0") ( + @Since("1.5.0") val freqSequences: RDD[PrefixSpan.FreqSequence[Item]]) extends Serializable From 9205907876cf65695e56c2a94bedd83df3675c03 Mon Sep 17 00:00:00 2001 From: Feynman Liang Date: Tue, 25 Aug 2015 13:23:15 -0700 Subject: [PATCH 079/802] [SPARK-9797] [MLLIB] [DOC] StreamingLinearRegressionWithSGD.setConvergenceTol default value Adds default convergence tolerance (0.001, set in `GradientDescent.convergenceTol`) to `setConvergenceTol`'s scaladoc Author: Feynman Liang Closes #8424 from feynmanliang/SPARK-9797. --- .../mllib/regression/StreamingLinearRegressionWithSGD.scala | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/mllib/src/main/scala/org/apache/spark/mllib/regression/StreamingLinearRegressionWithSGD.scala b/mllib/src/main/scala/org/apache/spark/mllib/regression/StreamingLinearRegressionWithSGD.scala index 537a05274eec2..26654e4a06838 100644 --- a/mllib/src/main/scala/org/apache/spark/mllib/regression/StreamingLinearRegressionWithSGD.scala +++ b/mllib/src/main/scala/org/apache/spark/mllib/regression/StreamingLinearRegressionWithSGD.scala @@ -93,7 +93,7 @@ class StreamingLinearRegressionWithSGD private[mllib] ( } /** - * Set the convergence tolerance. + * Set the convergence tolerance. Default: 0.001. */ def setConvergenceTol(tolerance: Double): this.type = { this.algorithm.optimizer.setConvergenceTol(tolerance) From 00ae4be97f7b205432db2967ba6d506286ef2ca6 Mon Sep 17 00:00:00 2001 From: Xiangrui Meng Date: Tue, 25 Aug 2015 14:11:38 -0700 Subject: [PATCH 080/802] [SPARK-10239] [SPARK-10244] [MLLIB] update since versions in mllib.pmml and mllib.util Same as #8421 but for `mllib.pmml` and `mllib.util`. 
cc dbtsai Author: Xiangrui Meng Closes #8430 from mengxr/SPARK-10239 and squashes the following commits: a189acf [Xiangrui Meng] update since versions in mllib.pmml and mllib.util --- .../org/apache/spark/mllib/pmml/PMMLExportable.scala | 7 ++++++- .../org/apache/spark/mllib/util/DataValidators.scala | 7 +++++-- .../apache/spark/mllib/util/KMeansDataGenerator.scala | 5 ++++- .../apache/spark/mllib/util/LinearDataGenerator.scala | 10 ++++++++-- .../mllib/util/LogisticRegressionDataGenerator.scala | 5 ++++- .../org/apache/spark/mllib/util/MFDataGenerator.scala | 4 +++- .../scala/org/apache/spark/mllib/util/MLUtils.scala | 2 ++ .../org/apache/spark/mllib/util/SVMDataGenerator.scala | 6 ++++-- .../org/apache/spark/mllib/util/modelSaveLoad.scala | 6 +++++- 9 files changed, 41 insertions(+), 11 deletions(-) diff --git a/mllib/src/main/scala/org/apache/spark/mllib/pmml/PMMLExportable.scala b/mllib/src/main/scala/org/apache/spark/mllib/pmml/PMMLExportable.scala index 5e882d4ebb10b..274ac7c99553b 100644 --- a/mllib/src/main/scala/org/apache/spark/mllib/pmml/PMMLExportable.scala +++ b/mllib/src/main/scala/org/apache/spark/mllib/pmml/PMMLExportable.scala @@ -23,7 +23,7 @@ import javax.xml.transform.stream.StreamResult import org.jpmml.model.JAXBUtil import org.apache.spark.SparkContext -import org.apache.spark.annotation.{DeveloperApi, Experimental} +import org.apache.spark.annotation.{DeveloperApi, Experimental, Since} import org.apache.spark.mllib.pmml.export.PMMLModelExportFactory /** @@ -33,6 +33,7 @@ import org.apache.spark.mllib.pmml.export.PMMLModelExportFactory * developed by the Data Mining Group (www.dmg.org). */ @DeveloperApi +@Since("1.4.0") trait PMMLExportable { /** @@ -48,6 +49,7 @@ trait PMMLExportable { * Export the model to a local file in PMML format */ @Experimental + @Since("1.4.0") def toPMML(localPath: String): Unit = { toPMML(new StreamResult(new File(localPath))) } @@ -57,6 +59,7 @@ trait PMMLExportable { * Export the model to a directory on a distributed file system in PMML format */ @Experimental + @Since("1.4.0") def toPMML(sc: SparkContext, path: String): Unit = { val pmml = toPMML() sc.parallelize(Array(pmml), 1).saveAsTextFile(path) @@ -67,6 +70,7 @@ trait PMMLExportable { * Export the model to the OutputStream in PMML format */ @Experimental + @Since("1.4.0") def toPMML(outputStream: OutputStream): Unit = { toPMML(new StreamResult(outputStream)) } @@ -76,6 +80,7 @@ trait PMMLExportable { * Export the model to a String in PMML format */ @Experimental + @Since("1.4.0") def toPMML(): String = { val writer = new StringWriter toPMML(new StreamResult(writer)) diff --git a/mllib/src/main/scala/org/apache/spark/mllib/util/DataValidators.scala b/mllib/src/main/scala/org/apache/spark/mllib/util/DataValidators.scala index be335a1aca58a..dffe6e78939e8 100644 --- a/mllib/src/main/scala/org/apache/spark/mllib/util/DataValidators.scala +++ b/mllib/src/main/scala/org/apache/spark/mllib/util/DataValidators.scala @@ -17,16 +17,17 @@ package org.apache.spark.mllib.util -import org.apache.spark.annotation.DeveloperApi import org.apache.spark.Logging -import org.apache.spark.rdd.RDD +import org.apache.spark.annotation.{DeveloperApi, Since} import org.apache.spark.mllib.regression.LabeledPoint +import org.apache.spark.rdd.RDD /** * :: DeveloperApi :: * A collection of methods used to validate data before applying ML algorithms. 
*/ @DeveloperApi +@Since("0.8.0") object DataValidators extends Logging { /** @@ -34,6 +35,7 @@ object DataValidators extends Logging { * * @return True if labels are all zero or one, false otherwise. */ + @Since("1.0.0") val binaryLabelValidator: RDD[LabeledPoint] => Boolean = { data => val numInvalid = data.filter(x => x.label != 1.0 && x.label != 0.0).count() if (numInvalid != 0) { @@ -48,6 +50,7 @@ object DataValidators extends Logging { * * @return True if labels are all in the range of {0, 1, ..., k-1}, false otherwise. */ + @Since("1.3.0") def multiLabelValidator(k: Int): RDD[LabeledPoint] => Boolean = { data => val numInvalid = data.filter(x => x.label - x.label.toInt != 0.0 || x.label < 0 || x.label > k - 1).count() diff --git a/mllib/src/main/scala/org/apache/spark/mllib/util/KMeansDataGenerator.scala b/mllib/src/main/scala/org/apache/spark/mllib/util/KMeansDataGenerator.scala index e6bcff48b022c..00fd1606a369c 100644 --- a/mllib/src/main/scala/org/apache/spark/mllib/util/KMeansDataGenerator.scala +++ b/mllib/src/main/scala/org/apache/spark/mllib/util/KMeansDataGenerator.scala @@ -19,8 +19,8 @@ package org.apache.spark.mllib.util import scala.util.Random -import org.apache.spark.annotation.DeveloperApi import org.apache.spark.SparkContext +import org.apache.spark.annotation.{DeveloperApi, Since} import org.apache.spark.rdd.RDD /** @@ -30,6 +30,7 @@ import org.apache.spark.rdd.RDD * cluster with scale 1 around each center. */ @DeveloperApi +@Since("0.8.0") object KMeansDataGenerator { /** @@ -42,6 +43,7 @@ object KMeansDataGenerator { * @param r Scaling factor for the distribution of the initial centers * @param numPartitions Number of partitions of the generated RDD; default 2 */ + @Since("0.8.0") def generateKMeansRDD( sc: SparkContext, numPoints: Int, @@ -62,6 +64,7 @@ object KMeansDataGenerator { } } + @Since("0.8.0") def main(args: Array[String]) { if (args.length < 6) { // scalastyle:off println diff --git a/mllib/src/main/scala/org/apache/spark/mllib/util/LinearDataGenerator.scala b/mllib/src/main/scala/org/apache/spark/mllib/util/LinearDataGenerator.scala index 7a1c7796065ee..d0ba454f379a9 100644 --- a/mllib/src/main/scala/org/apache/spark/mllib/util/LinearDataGenerator.scala +++ b/mllib/src/main/scala/org/apache/spark/mllib/util/LinearDataGenerator.scala @@ -22,11 +22,11 @@ import scala.util.Random import com.github.fommil.netlib.BLAS.{getInstance => blas} -import org.apache.spark.annotation.DeveloperApi import org.apache.spark.SparkContext -import org.apache.spark.rdd.RDD +import org.apache.spark.annotation.{DeveloperApi, Since} import org.apache.spark.mllib.linalg.Vectors import org.apache.spark.mllib.regression.LabeledPoint +import org.apache.spark.rdd.RDD /** * :: DeveloperApi :: @@ -35,6 +35,7 @@ import org.apache.spark.mllib.regression.LabeledPoint * response variable `Y`. */ @DeveloperApi +@Since("0.8.0") object LinearDataGenerator { /** @@ -46,6 +47,7 @@ object LinearDataGenerator { * @param seed Random seed * @return Java List of input. */ + @Since("0.8.0") def generateLinearInputAsList( intercept: Double, weights: Array[Double], @@ -68,6 +70,7 @@ object LinearDataGenerator { * @param eps Epsilon scaling factor. * @return Seq of input. */ + @Since("0.8.0") def generateLinearInput( intercept: Double, weights: Array[Double], @@ -92,6 +95,7 @@ object LinearDataGenerator { * @param eps Epsilon scaling factor. * @return Seq of input. 
*/ + @Since("0.8.0") def generateLinearInput( intercept: Double, weights: Array[Double], @@ -132,6 +136,7 @@ object LinearDataGenerator { * * @return RDD of LabeledPoint containing sample data. */ + @Since("0.8.0") def generateLinearRDD( sc: SparkContext, nexamples: Int, @@ -151,6 +156,7 @@ object LinearDataGenerator { data } + @Since("0.8.0") def main(args: Array[String]) { if (args.length < 2) { // scalastyle:off println diff --git a/mllib/src/main/scala/org/apache/spark/mllib/util/LogisticRegressionDataGenerator.scala b/mllib/src/main/scala/org/apache/spark/mllib/util/LogisticRegressionDataGenerator.scala index c09cbe69bb971..33477ee20ebbd 100644 --- a/mllib/src/main/scala/org/apache/spark/mllib/util/LogisticRegressionDataGenerator.scala +++ b/mllib/src/main/scala/org/apache/spark/mllib/util/LogisticRegressionDataGenerator.scala @@ -19,7 +19,7 @@ package org.apache.spark.mllib.util import scala.util.Random -import org.apache.spark.annotation.DeveloperApi +import org.apache.spark.annotation.{Since, DeveloperApi} import org.apache.spark.SparkContext import org.apache.spark.rdd.RDD import org.apache.spark.mllib.regression.LabeledPoint @@ -31,6 +31,7 @@ import org.apache.spark.mllib.linalg.Vectors * with probability `probOne` and scales features for positive examples by `eps`. */ @DeveloperApi +@Since("0.8.0") object LogisticRegressionDataGenerator { /** @@ -43,6 +44,7 @@ object LogisticRegressionDataGenerator { * @param nparts Number of partitions of the generated RDD. Default value is 2. * @param probOne Probability that a label is 1 (and not 0). Default value is 0.5. */ + @Since("0.8.0") def generateLogisticRDD( sc: SparkContext, nexamples: Int, @@ -62,6 +64,7 @@ object LogisticRegressionDataGenerator { data } + @Since("0.8.0") def main(args: Array[String]) { if (args.length != 5) { // scalastyle:off println diff --git a/mllib/src/main/scala/org/apache/spark/mllib/util/MFDataGenerator.scala b/mllib/src/main/scala/org/apache/spark/mllib/util/MFDataGenerator.scala index 16f430599a515..906bd30563bd0 100644 --- a/mllib/src/main/scala/org/apache/spark/mllib/util/MFDataGenerator.scala +++ b/mllib/src/main/scala/org/apache/spark/mllib/util/MFDataGenerator.scala @@ -23,7 +23,7 @@ import scala.language.postfixOps import scala.util.Random import org.apache.spark.SparkContext -import org.apache.spark.annotation.DeveloperApi +import org.apache.spark.annotation.{Since, DeveloperApi} import org.apache.spark.mllib.linalg.{BLAS, DenseMatrix} import org.apache.spark.rdd.RDD @@ -52,7 +52,9 @@ import org.apache.spark.rdd.RDD * testSampFact (Double) Percentage of training data to use as test data. */ @DeveloperApi +@Since("0.8.0") object MFDataGenerator { + @Since("0.8.0") def main(args: Array[String]) { if (args.length < 2) { // scalastyle:off println diff --git a/mllib/src/main/scala/org/apache/spark/mllib/util/MLUtils.scala b/mllib/src/main/scala/org/apache/spark/mllib/util/MLUtils.scala index 4940974bf4f41..81c2f0ce6e12c 100644 --- a/mllib/src/main/scala/org/apache/spark/mllib/util/MLUtils.scala +++ b/mllib/src/main/scala/org/apache/spark/mllib/util/MLUtils.scala @@ -36,6 +36,7 @@ import org.apache.spark.streaming.dstream.DStream /** * Helper methods to load, save and pre-process data used in ML Lib. 
*/ +@Since("0.8.0") object MLUtils { private[mllib] lazy val EPSILON = { @@ -168,6 +169,7 @@ object MLUtils { * * @see [[org.apache.spark.mllib.util.MLUtils#loadLibSVMFile]] */ + @Since("1.0.0") def saveAsLibSVMFile(data: RDD[LabeledPoint], dir: String) { // TODO: allow to specify label precision and feature precision. val dataStr = data.map { case LabeledPoint(label, features) => diff --git a/mllib/src/main/scala/org/apache/spark/mllib/util/SVMDataGenerator.scala b/mllib/src/main/scala/org/apache/spark/mllib/util/SVMDataGenerator.scala index ad20b7694a779..cde5979396178 100644 --- a/mllib/src/main/scala/org/apache/spark/mllib/util/SVMDataGenerator.scala +++ b/mllib/src/main/scala/org/apache/spark/mllib/util/SVMDataGenerator.scala @@ -21,11 +21,11 @@ import scala.util.Random import com.github.fommil.netlib.BLAS.{getInstance => blas} -import org.apache.spark.annotation.DeveloperApi import org.apache.spark.SparkContext -import org.apache.spark.rdd.RDD +import org.apache.spark.annotation.{DeveloperApi, Since} import org.apache.spark.mllib.linalg.Vectors import org.apache.spark.mllib.regression.LabeledPoint +import org.apache.spark.rdd.RDD /** * :: DeveloperApi :: @@ -33,8 +33,10 @@ import org.apache.spark.mllib.regression.LabeledPoint * for the features and adds Gaussian noise with weight 0.1 to generate labels. */ @DeveloperApi +@Since("0.8.0") object SVMDataGenerator { + @Since("0.8.0") def main(args: Array[String]) { if (args.length < 2) { // scalastyle:off println diff --git a/mllib/src/main/scala/org/apache/spark/mllib/util/modelSaveLoad.scala b/mllib/src/main/scala/org/apache/spark/mllib/util/modelSaveLoad.scala index 30d642c754b7c..4d71d534a0774 100644 --- a/mllib/src/main/scala/org/apache/spark/mllib/util/modelSaveLoad.scala +++ b/mllib/src/main/scala/org/apache/spark/mllib/util/modelSaveLoad.scala @@ -24,7 +24,7 @@ import org.json4s._ import org.json4s.jackson.JsonMethods._ import org.apache.spark.SparkContext -import org.apache.spark.annotation.DeveloperApi +import org.apache.spark.annotation.{DeveloperApi, Since} import org.apache.spark.sql.catalyst.ScalaReflection import org.apache.spark.sql.types.{DataType, StructField, StructType} @@ -35,6 +35,7 @@ import org.apache.spark.sql.types.{DataType, StructField, StructType} * This should be inherited by the class which implements model instances. */ @DeveloperApi +@Since("1.3.0") trait Saveable { /** @@ -50,6 +51,7 @@ trait Saveable { * @param path Path specifying the directory in which to save this model. * If the directory already exists, this method throws an exception. */ + @Since("1.3.0") def save(sc: SparkContext, path: String): Unit /** Current version of model save/load format. */ @@ -64,6 +66,7 @@ trait Saveable { * This should be inherited by an object paired with the model class. */ @DeveloperApi +@Since("1.3.0") trait Loader[M <: Saveable] { /** @@ -75,6 +78,7 @@ trait Loader[M <: Saveable] { * @param path Path specifying the directory to which the model was saved. * @return Model instance */ + @Since("1.3.0") def load(sc: SparkContext, path: String): M } From ec89bd840a6862751999d612f586a962cae63f6d Mon Sep 17 00:00:00 2001 From: Davies Liu Date: Tue, 25 Aug 2015 14:55:34 -0700 Subject: [PATCH 081/802] [SPARK-10245] [SQL] Fix decimal literals with precision < scale In BigDecimal or java.math.BigDecimal, the precision could be smaller than scale, for example, BigDecimal("0.001") has precision = 1 and scale = 3. 
But DecimalType require that the precision should be larger than scale, so we should use the maximum of precision and scale when inferring the schema from decimal literal. Author: Davies Liu Closes #8428 from davies/smaller_decimal. --- .../spark/sql/catalyst/expressions/literals.scala | 7 ++++--- .../catalyst/expressions/LiteralExpressionSuite.scala | 8 +++++--- .../scala/org/apache/spark/sql/SQLQuerySuite.scala | 10 ++++++++++ 3 files changed, 19 insertions(+), 6 deletions(-) diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/literals.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/literals.scala index 34bad23802ba4..8c0c5d5b1e31e 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/literals.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/literals.scala @@ -36,9 +36,10 @@ object Literal { case s: Short => Literal(s, ShortType) case s: String => Literal(UTF8String.fromString(s), StringType) case b: Boolean => Literal(b, BooleanType) - case d: BigDecimal => Literal(Decimal(d), DecimalType(d.precision, d.scale)) - case d: java.math.BigDecimal => Literal(Decimal(d), DecimalType(d.precision(), d.scale())) - case d: Decimal => Literal(d, DecimalType(d.precision, d.scale)) + case d: BigDecimal => Literal(Decimal(d), DecimalType(Math.max(d.precision, d.scale), d.scale)) + case d: java.math.BigDecimal => + Literal(Decimal(d), DecimalType(Math.max(d.precision, d.scale), d.scale())) + case d: Decimal => Literal(d, DecimalType(Math.max(d.precision, d.scale), d.scale)) case t: Timestamp => Literal(DateTimeUtils.fromJavaTimestamp(t), TimestampType) case d: Date => Literal(DateTimeUtils.fromJavaDate(d), DateType) case a: Array[Byte] => Literal(a, BinaryType) diff --git a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/LiteralExpressionSuite.scala b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/LiteralExpressionSuite.scala index f6404d21611e5..015eb1897fb8c 100644 --- a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/LiteralExpressionSuite.scala +++ b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/LiteralExpressionSuite.scala @@ -83,12 +83,14 @@ class LiteralExpressionSuite extends SparkFunSuite with ExpressionEvalHelper { } test("decimal") { - List(0.0, 1.2, 1.1111, 5).foreach { d => + List(-0.0001, 0.0, 0.001, 1.2, 1.1111, 5).foreach { d => checkEvaluation(Literal(Decimal(d)), Decimal(d)) checkEvaluation(Literal(Decimal(d.toInt)), Decimal(d.toInt)) checkEvaluation(Literal(Decimal(d.toLong)), Decimal(d.toLong)) - checkEvaluation(Literal(Decimal((d * 1000L).toLong, 10, 1)), - Decimal((d * 1000L).toLong, 10, 1)) + checkEvaluation(Literal(Decimal((d * 1000L).toLong, 10, 3)), + Decimal((d * 1000L).toLong, 10, 3)) + checkEvaluation(Literal(BigDecimal(d.toString)), Decimal(d)) + checkEvaluation(Literal(new java.math.BigDecimal(d.toString)), Decimal(d)) } } diff --git a/sql/core/src/test/scala/org/apache/spark/sql/SQLQuerySuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/SQLQuerySuite.scala index dcb4e83710982..aa07665c6b705 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/SQLQuerySuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/SQLQuerySuite.scala @@ -1627,6 +1627,16 @@ class SQLQuerySuite extends QueryTest with SharedSQLContext { Row(null)) } + test("precision smaller than scale") { + checkAnswer(sql("select 10.00"), Row(BigDecimal("10.00"))) + 
checkAnswer(sql("select 1.00"), Row(BigDecimal("1.00"))) + checkAnswer(sql("select 0.10"), Row(BigDecimal("0.10"))) + checkAnswer(sql("select 0.01"), Row(BigDecimal("0.01"))) + checkAnswer(sql("select 0.001"), Row(BigDecimal("0.001"))) + checkAnswer(sql("select -0.01"), Row(BigDecimal("-0.01"))) + checkAnswer(sql("select -0.001"), Row(BigDecimal("-0.001"))) + } + test("external sorting updates peak execution memory") { withSQLConf((SQLConf.EXTERNAL_SORT.key, "true")) { val sc = sqlContext.sparkContext From 7467b52ed07f174d93dfc4cb544dc4b69a2c2826 Mon Sep 17 00:00:00 2001 From: Davies Liu Date: Tue, 25 Aug 2015 15:19:41 -0700 Subject: [PATCH 082/802] [SPARK-10215] [SQL] Fix precision of division (follow the rule in Hive) Follow the rule in Hive for decimal division. see https://github.com/apache/hive/blob/ac755ebe26361a4647d53db2a28500f71697b276/ql/src/java/org/apache/hadoop/hive/ql/udf/generic/GenericUDFOPDivide.java#L113 cc chenghao-intel Author: Davies Liu Closes #8415 from davies/decimal_div2. --- .../catalyst/analysis/HiveTypeCoercion.scala | 10 ++++++-- .../sql/catalyst/analysis/AnalysisSuite.scala | 9 +++---- .../analysis/DecimalPrecisionSuite.scala | 8 +++--- .../org/apache/spark/sql/SQLQuerySuite.scala | 25 +++++++++++++++++-- 4 files changed, 39 insertions(+), 13 deletions(-) diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/HiveTypeCoercion.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/HiveTypeCoercion.scala index a1aa2a2b2c680..87c11abbad490 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/HiveTypeCoercion.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/HiveTypeCoercion.scala @@ -396,8 +396,14 @@ object HiveTypeCoercion { resultType) case Divide(e1 @ DecimalType.Expression(p1, s1), e2 @ DecimalType.Expression(p2, s2)) => - val resultType = DecimalType.bounded(p1 - s1 + s2 + max(6, s1 + p2 + 1), - max(6, s1 + p2 + 1)) + var intDig = min(DecimalType.MAX_SCALE, p1 - s1 + s2) + var decDig = min(DecimalType.MAX_SCALE, max(6, s1 + p2 + 1)) + val diff = (intDig + decDig) - DecimalType.MAX_SCALE + if (diff > 0) { + decDig -= diff / 2 + 1 + intDig = DecimalType.MAX_SCALE - decDig + } + val resultType = DecimalType.bounded(intDig + decDig, decDig) val widerType = widerDecimalType(p1, s1, p2, s2) CheckOverflow(Divide(promotePrecision(e1, widerType), promotePrecision(e2, widerType)), resultType) diff --git a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/analysis/AnalysisSuite.scala b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/analysis/AnalysisSuite.scala index 1e0cc81dae974..820b336aac759 100644 --- a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/analysis/AnalysisSuite.scala +++ b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/analysis/AnalysisSuite.scala @@ -17,15 +17,14 @@ package org.apache.spark.sql.catalyst.analysis +import org.apache.spark.sql.catalyst.dsl.expressions._ +import org.apache.spark.sql.catalyst.dsl.plans._ import org.apache.spark.sql.catalyst.expressions._ import org.apache.spark.sql.catalyst.plans.logical._ import org.apache.spark.sql.types._ -import org.apache.spark.sql.catalyst.SimpleCatalystConf -import org.apache.spark.sql.catalyst.dsl.expressions._ -import org.apache.spark.sql.catalyst.dsl.plans._ class AnalysisSuite extends AnalysisTest { - import TestRelations._ + import org.apache.spark.sql.catalyst.analysis.TestRelations._ test("union project *") { val plan = (1 to 100) @@ -96,7 +95,7 @@ class 
AnalysisSuite extends AnalysisTest { assert(pl(1).dataType == DoubleType) assert(pl(2).dataType == DoubleType) // StringType will be promoted into Decimal(38, 18) - assert(pl(3).dataType == DecimalType(38, 29)) + assert(pl(3).dataType == DecimalType(38, 22)) assert(pl(4).dataType == DoubleType) } diff --git a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/analysis/DecimalPrecisionSuite.scala b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/analysis/DecimalPrecisionSuite.scala index fc11627da6fd1..b4ad618c23e39 100644 --- a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/analysis/DecimalPrecisionSuite.scala +++ b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/analysis/DecimalPrecisionSuite.scala @@ -136,10 +136,10 @@ class DecimalPrecisionSuite extends SparkFunSuite with BeforeAndAfter { checkType(Multiply(i, u), DecimalType(38, 18)) checkType(Multiply(u, u), DecimalType(38, 36)) - checkType(Divide(u, d1), DecimalType(38, 21)) - checkType(Divide(u, d2), DecimalType(38, 24)) - checkType(Divide(u, i), DecimalType(38, 29)) - checkType(Divide(u, u), DecimalType(38, 38)) + checkType(Divide(u, d1), DecimalType(38, 18)) + checkType(Divide(u, d2), DecimalType(38, 19)) + checkType(Divide(u, i), DecimalType(38, 23)) + checkType(Divide(u, u), DecimalType(38, 18)) checkType(Remainder(d1, u), DecimalType(19, 18)) checkType(Remainder(d2, u), DecimalType(21, 18)) diff --git a/sql/core/src/test/scala/org/apache/spark/sql/SQLQuerySuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/SQLQuerySuite.scala index aa07665c6b705..9e172b2c264cb 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/SQLQuerySuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/SQLQuerySuite.scala @@ -1622,9 +1622,30 @@ class SQLQuerySuite extends QueryTest with SharedSQLContext { checkAnswer(sql("select 10.3000 / 3.0"), Row(BigDecimal("3.4333333"))) checkAnswer(sql("select 10.30000 / 30.0"), Row(BigDecimal("0.343333333"))) checkAnswer(sql("select 10.300000000000000000 / 3.00000000000000000"), - Row(BigDecimal("3.4333333333333333333333333333333333333", new MathContext(38)))) + Row(BigDecimal("3.433333333333333333333333333", new MathContext(38)))) checkAnswer(sql("select 10.3000000000000000000 / 3.00000000000000000"), - Row(null)) + Row(BigDecimal("3.4333333333333333333333333333", new MathContext(38)))) + } + + test("SPARK-10215 Div of Decimal returns null") { + val d = Decimal(1.12321) + val df = Seq((d, 1)).toDF("a", "b") + + checkAnswer( + df.selectExpr("b * a / b"), + Seq(Row(d.toBigDecimal))) + checkAnswer( + df.selectExpr("b * a / b / b"), + Seq(Row(d.toBigDecimal))) + checkAnswer( + df.selectExpr("b * a + b"), + Seq(Row(BigDecimal(2.12321)))) + checkAnswer( + df.selectExpr("b * a - b"), + Seq(Row(BigDecimal(0.12321)))) + checkAnswer( + df.selectExpr("b * a * b"), + Seq(Row(d.toBigDecimal))) } test("precision smaller than scale") { From 125205cdb35530cdb4a8fff3e1ee49cf4a299583 Mon Sep 17 00:00:00 2001 From: Feynman Liang Date: Tue, 25 Aug 2015 17:39:20 -0700 Subject: [PATCH 083/802] [SPARK-9888] [MLLIB] User guide for new LDA features * Adds two new sections to LDA's user guide; one for each optimizer/model * Documents new features added to LDA (e.g. topXXXperXXX, asymmetric priors, hyperpam optimization) * Cleans up a TODO and sets a default parameter in LDA code jkbradley hhbyyh Author: Feynman Liang Closes #8254 from feynmanliang/SPARK-9888. 
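As a companion to the guide text in the diff below, a minimal Scala sketch of choosing between the two LDA optimizers. It assumes an existing SparkContext `sc`; the toy corpus and parameter values are illustrative only.

    import org.apache.spark.mllib.clustering.{EMLDAOptimizer, LDA, OnlineLDAOptimizer}
    import org.apache.spark.mllib.linalg.Vectors

    // Corpus of (document id, term-count vector) pairs.
    val corpus = sc.parallelize(Seq(
      (0L, Vectors.dense(1.0, 2.0, 6.0)),
      (1L, Vectors.dense(1.0, 3.0, 0.0)),
      (2L, Vectors.dense(1.0, 4.0, 1.0))))

    // Expectation-maximization: produces a DistributedLDAModel.
    val emModel = new LDA()
      .setK(2)
      .setOptimizer(new EMLDAOptimizer)
      .setMaxIterations(10)
      .run(corpus)

    // Online variational Bayes: produces a LocalLDAModel.
    val onlineModel = new LDA()
      .setK(2)
      .setOptimizer(new OnlineLDAOptimizer().setMiniBatchFraction(0.5))
      .setMaxIterations(10)
      .run(corpus)

    // Both models expose the inferred topics.
    val topics = onlineModel.describeTopics(maxTermsPerTopic = 2)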
--- docs/mllib-clustering.md | 135 +++++++++++++++--- .../spark/mllib/clustering/LDAModel.scala | 1 - .../spark/mllib/clustering/LDASuite.scala | 1 + 3 files changed, 117 insertions(+), 20 deletions(-) diff --git a/docs/mllib-clustering.md b/docs/mllib-clustering.md index fd9ab258e196d..3fb35d3c50b06 100644 --- a/docs/mllib-clustering.md +++ b/docs/mllib-clustering.md @@ -438,28 +438,125 @@ sameModel = PowerIterationClusteringModel.load(sc, "myModelPath") is a topic model which infers topics from a collection of text documents. LDA can be thought of as a clustering algorithm as follows: -* Topics correspond to cluster centers, and documents correspond to examples (rows) in a dataset. -* Topics and documents both exist in a feature space, where feature vectors are vectors of word counts. -* Rather than estimating a clustering using a traditional distance, LDA uses a function based - on a statistical model of how text documents are generated. - -LDA takes in a collection of documents as vectors of word counts. -It supports different inference algorithms via `setOptimizer` function. EMLDAOptimizer learns clustering using [expectation-maximization](http://en.wikipedia.org/wiki/Expectation%E2%80%93maximization_algorithm) -on the likelihood function and yields comprehensive results, while OnlineLDAOptimizer uses iterative mini-batch sampling for [online variational inference](https://www.cs.princeton.edu/~blei/papers/HoffmanBleiBach2010b.pdf) and is generally memory friendly. After fitting on the documents, LDA provides: - -* Topics: Inferred topics, each of which is a probability distribution over terms (words). -* Topic distributions for documents: For each non empty document in the training set, LDA gives a probability distribution over topics. (EM only). Note that for empty documents, we don't create the topic distributions. (EM only) +* Topics correspond to cluster centers, and documents correspond to +examples (rows) in a dataset. +* Topics and documents both exist in a feature space, where feature +vectors are vectors of word counts (bag of words). +* Rather than estimating a clustering using a traditional distance, LDA +uses a function based on a statistical model of how text documents are +generated. + +LDA supports different inference algorithms via `setOptimizer` function. +`EMLDAOptimizer` learns clustering using +[expectation-maximization](http://en.wikipedia.org/wiki/Expectation%E2%80%93maximization_algorithm) +on the likelihood function and yields comprehensive results, while +`OnlineLDAOptimizer` uses iterative mini-batch sampling for [online +variational +inference](https://www.cs.princeton.edu/~blei/papers/HoffmanBleiBach2010b.pdf) +and is generally memory friendly. -LDA takes the following parameters: +LDA takes in a collection of documents as vectors of word counts and the +following parameters (set using the builder pattern): * `k`: Number of topics (i.e., cluster centers) -* `maxIterations`: Limit on the number of iterations of EM used for learning -* `docConcentration`: Hyperparameter for prior over documents' distributions over topics. Currently must be > 1, where larger values encourage smoother inferred distributions. -* `topicConcentration`: Hyperparameter for prior over topics' distributions over terms (words). Currently must be > 1, where larger values encourage smoother inferred distributions. -* `checkpointInterval`: If using checkpointing (set in the Spark configuration), this parameter specifies the frequency with which checkpoints will be created. 
If `maxIterations` is large, using checkpointing can help reduce shuffle file sizes on disk and help with failure recovery. - -*Note*: LDA is a new feature with some missing functionality. In particular, it does not yet -support prediction on new documents, and it does not have a Python API. These will be added in the future. +* `optimizer`: Optimizer to use for learning the LDA model, either +`EMLDAOptimizer` or `OnlineLDAOptimizer` +* `docConcentration`: Dirichlet parameter for prior over documents' +distributions over topics. Larger values encourage smoother inferred +distributions. +* `topicConcentration`: Dirichlet parameter for prior over topics' +distributions over terms (words). Larger values encourage smoother +inferred distributions. +* `maxIterations`: Limit on the number of iterations. +* `checkpointInterval`: If using checkpointing (set in the Spark +configuration), this parameter specifies the frequency with which +checkpoints will be created. If `maxIterations` is large, using +checkpointing can help reduce shuffle file sizes on disk and help with +failure recovery. + + +All of MLlib's LDA models support: + +* `describeTopics`: Returns topics as arrays of most important terms and +term weights +* `topicsMatrix`: Returns a `vocabSize` by `k` matrix where each column +is a topic + +*Note*: LDA is still an experimental feature under active development. +As a result, certain features are only available in one of the two +optimizers / models generated by the optimizer. Currently, a distributed +model can be converted into a local model, but not vice-versa. + +The following discussion will describe each optimizer/model pair +separately. + +**Expectation Maximization** + +Implemented in +[`EMLDAOptimizer`](api/scala/index.html#org.apache.spark.mllib.clustering.EMLDAOptimizer) +and +[`DistributedLDAModel`](api/scala/index.html#org.apache.spark.mllib.clustering.DistributedLDAModel). + +For the parameters provided to `LDA`: + +* `docConcentration`: Only symmetric priors are supported, so all values +in the provided `k`-dimensional vector must be identical. All values +must also be $> 1.0$. Providing `Vector(-1)` results in default behavior +(uniform `k` dimensional vector with value $(50 / k) + 1$ +* `topicConcentration`: Only symmetric priors supported. Values must be +$> 1.0$. Providing `-1` results in defaulting to a value of $0.1 + 1$. +* `maxIterations`: The maximum number of EM iterations. + +`EMLDAOptimizer` produces a `DistributedLDAModel`, which stores not only +the inferred topics but also the full training corpus and topic +distributions for each document in the training corpus. A +`DistributedLDAModel` supports: + + * `topTopicsPerDocument`: The top topics and their weights for + each document in the training corpus + * `topDocumentsPerTopic`: The top documents for each topic and + the corresponding weight of the topic in the documents. + * `logPrior`: log probability of the estimated topics and + document-topic distributions given the hyperparameters + `docConcentration` and `topicConcentration` + * `logLikelihood`: log likelihood of the training corpus, given the + inferred topics and document-topic distributions + +**Online Variational Bayes** + +Implemented in +[`OnlineLDAOptimizer`](api/scala/org/apache/spark/mllib/clustering/OnlineLDAOptimizer.html) +and +[`LocalLDAModel`](api/scala/org/apache/spark/mllib/clustering/LocalLDAModel.html). 
+ +For the parameters provided to `LDA`: + +* `docConcentration`: Asymmetric priors can be used by passing in a +vector with values equal to the Dirichlet parameter in each of the `k` +dimensions. Values should be $>= 0$. Providing `Vector(-1)` results in +default behavior (uniform `k` dimensional vector with value $(1.0 / k)$) +* `topicConcentration`: Only symmetric priors supported. Values must be +$>= 0$. Providing `-1` results in defaulting to a value of $(1.0 / k)$. +* `maxIterations`: Maximum number of minibatches to submit. + +In addition, `OnlineLDAOptimizer` accepts the following parameters: + +* `miniBatchFraction`: Fraction of corpus sampled and used at each +iteration +* `optimizeDocConcentration`: If set to true, performs maximum-likelihood +estimation of the hyperparameter `docConcentration` (aka `alpha`) +after each minibatch and sets the optimized `docConcentration` in the +returned `LocalLDAModel` +* `tau0` and `kappa`: Used for learning-rate decay, which is computed by +$(\tau_0 + iter)^{-\kappa}$ where $iter$ is the current number of iterations. + +`OnlineLDAOptimizer` produces a `LocalLDAModel`, which only stores the +inferred topics. A `LocalLDAModel` supports: + +* `logLikelihood(documents)`: Calculates a lower bound on the provided +`documents` given the inferred topics. +* `logPerplexity(documents)`: Calculates an upper bound on the +perplexity of the provided `documents` given the inferred topics. **Examples** diff --git a/mllib/src/main/scala/org/apache/spark/mllib/clustering/LDAModel.scala b/mllib/src/main/scala/org/apache/spark/mllib/clustering/LDAModel.scala index 667374a2bc418..432bbedc8d6f8 100644 --- a/mllib/src/main/scala/org/apache/spark/mllib/clustering/LDAModel.scala +++ b/mllib/src/main/scala/org/apache/spark/mllib/clustering/LDAModel.scala @@ -435,7 +435,6 @@ object LocalLDAModel extends Loader[LocalLDAModel] { } val topicsMat = Matrices.fromBreeze(brzTopics) - // TODO: initialize with docConcentration, topicConcentration, and gammaShape after SPARK-9940 new LocalLDAModel(topicsMat, docConcentration, topicConcentration, gammaShape) } } diff --git a/mllib/src/test/scala/org/apache/spark/mllib/clustering/LDASuite.scala b/mllib/src/test/scala/org/apache/spark/mllib/clustering/LDASuite.scala index 746a76a7e5fa1..37fb69d68f6be 100644 --- a/mllib/src/test/scala/org/apache/spark/mllib/clustering/LDASuite.scala +++ b/mllib/src/test/scala/org/apache/spark/mllib/clustering/LDASuite.scala @@ -68,6 +68,7 @@ class LDASuite extends SparkFunSuite with MLlibTestSparkContext { // Train a model val lda = new LDA() lda.setK(k) + .setOptimizer(new EMLDAOptimizer) .setDocConcentration(topicSmoothing) .setTopicConcentration(termSmoothing) .setMaxIterations(5) From 8668ead2e7097b9591069599fbfccf67c53db659 Mon Sep 17 00:00:00 2001 From: Xiangrui Meng Date: Tue, 25 Aug 2015 18:17:54 -0700 Subject: [PATCH 084/802] [SPARK-10233] [MLLIB] update since version in mllib.evaluation Same as #8421 but for `mllib.evaluation`. cc avulanov Author: Xiangrui Meng Closes #8423 from mengxr/SPARK-10233. 
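The diff below annotates the public members of the evaluation metrics classes; for orientation, a minimal Scala sketch of one of those classes. It assumes an existing SparkContext `sc`; the prediction/label pairs are illustrative only.

    import org.apache.spark.mllib.evaluation.MulticlassMetrics

    // (prediction, label) pairs, toy values for illustration.
    val predictionAndLabels = sc.parallelize(Seq(
      (0.0, 0.0), (1.0, 1.0), (1.0, 0.0), (2.0, 2.0)))

    val metrics = new MulticlassMetrics(predictionAndLabels)
    metrics.precision        // overall precision
    metrics.recall(1.0)      // recall for label 1.0
    metrics.confusionMatrix  // k x k confusion matrix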
--- .../evaluation/BinaryClassificationMetrics.scala | 8 ++++---- .../spark/mllib/evaluation/MulticlassMetrics.scala | 11 ++++++++++- .../spark/mllib/evaluation/MultilabelMetrics.scala | 12 +++++++++++- .../spark/mllib/evaluation/RegressionMetrics.scala | 3 ++- 4 files changed, 27 insertions(+), 7 deletions(-) diff --git a/mllib/src/main/scala/org/apache/spark/mllib/evaluation/BinaryClassificationMetrics.scala b/mllib/src/main/scala/org/apache/spark/mllib/evaluation/BinaryClassificationMetrics.scala index 76ae847921f44..508fe532b1306 100644 --- a/mllib/src/main/scala/org/apache/spark/mllib/evaluation/BinaryClassificationMetrics.scala +++ b/mllib/src/main/scala/org/apache/spark/mllib/evaluation/BinaryClassificationMetrics.scala @@ -42,11 +42,11 @@ import org.apache.spark.sql.DataFrame * be smaller as a result, meaning there may be an extra sample at * partition boundaries. */ -@Since("1.3.0") +@Since("1.0.0") @Experimental -class BinaryClassificationMetrics( - val scoreAndLabels: RDD[(Double, Double)], - val numBins: Int) extends Logging { +class BinaryClassificationMetrics @Since("1.3.0") ( + @Since("1.3.0") val scoreAndLabels: RDD[(Double, Double)], + @Since("1.3.0") val numBins: Int) extends Logging { require(numBins >= 0, "numBins must be nonnegative") diff --git a/mllib/src/main/scala/org/apache/spark/mllib/evaluation/MulticlassMetrics.scala b/mllib/src/main/scala/org/apache/spark/mllib/evaluation/MulticlassMetrics.scala index 02e89d921033c..00e837661dfc2 100644 --- a/mllib/src/main/scala/org/apache/spark/mllib/evaluation/MulticlassMetrics.scala +++ b/mllib/src/main/scala/org/apache/spark/mllib/evaluation/MulticlassMetrics.scala @@ -33,7 +33,7 @@ import org.apache.spark.sql.DataFrame */ @Since("1.1.0") @Experimental -class MulticlassMetrics(predictionAndLabels: RDD[(Double, Double)]) { +class MulticlassMetrics @Since("1.1.0") (predictionAndLabels: RDD[(Double, Double)]) { /** * An auxiliary constructor taking a DataFrame. 
@@ -140,6 +140,7 @@ class MulticlassMetrics(predictionAndLabels: RDD[(Double, Double)]) { /** * Returns precision */ + @Since("1.1.0") lazy val precision: Double = tpByClass.values.sum.toDouble / labelCount /** @@ -148,23 +149,27 @@ class MulticlassMetrics(predictionAndLabels: RDD[(Double, Double)]) { * because sum of all false positives is equal to sum * of all false negatives) */ + @Since("1.1.0") lazy val recall: Double = precision /** * Returns f-measure * (equals to precision and recall because precision equals recall) */ + @Since("1.1.0") lazy val fMeasure: Double = precision /** * Returns weighted true positive rate * (equals to precision, recall and f-measure) */ + @Since("1.1.0") lazy val weightedTruePositiveRate: Double = weightedRecall /** * Returns weighted false positive rate */ + @Since("1.1.0") lazy val weightedFalsePositiveRate: Double = labelCountByClass.map { case (category, count) => falsePositiveRate(category) * count.toDouble / labelCount }.sum @@ -173,6 +178,7 @@ class MulticlassMetrics(predictionAndLabels: RDD[(Double, Double)]) { * Returns weighted averaged recall * (equals to precision, recall and f-measure) */ + @Since("1.1.0") lazy val weightedRecall: Double = labelCountByClass.map { case (category, count) => recall(category) * count.toDouble / labelCount }.sum @@ -180,6 +186,7 @@ class MulticlassMetrics(predictionAndLabels: RDD[(Double, Double)]) { /** * Returns weighted averaged precision */ + @Since("1.1.0") lazy val weightedPrecision: Double = labelCountByClass.map { case (category, count) => precision(category) * count.toDouble / labelCount }.sum @@ -196,6 +203,7 @@ class MulticlassMetrics(predictionAndLabels: RDD[(Double, Double)]) { /** * Returns weighted averaged f1-measure */ + @Since("1.1.0") lazy val weightedFMeasure: Double = labelCountByClass.map { case (category, count) => fMeasure(category, 1.0) * count.toDouble / labelCount }.sum @@ -203,5 +211,6 @@ class MulticlassMetrics(predictionAndLabels: RDD[(Double, Double)]) { /** * Returns the sequence of labels in ascending order */ + @Since("1.1.0") lazy val labels: Array[Double] = tpByClass.keys.toArray.sorted } diff --git a/mllib/src/main/scala/org/apache/spark/mllib/evaluation/MultilabelMetrics.scala b/mllib/src/main/scala/org/apache/spark/mllib/evaluation/MultilabelMetrics.scala index a0a8d9c56847b..c100b3c9ec14a 100644 --- a/mllib/src/main/scala/org/apache/spark/mllib/evaluation/MultilabelMetrics.scala +++ b/mllib/src/main/scala/org/apache/spark/mllib/evaluation/MultilabelMetrics.scala @@ -28,7 +28,7 @@ import org.apache.spark.sql.DataFrame * both are non-null Arrays, each with unique elements. */ @Since("1.2.0") -class MultilabelMetrics(predictionAndLabels: RDD[(Array[Double], Array[Double])]) { +class MultilabelMetrics @Since("1.2.0") (predictionAndLabels: RDD[(Array[Double], Array[Double])]) { /** * An auxiliary constructor taking a DataFrame. 
@@ -46,6 +46,7 @@ class MultilabelMetrics(predictionAndLabels: RDD[(Array[Double], Array[Double])] * Returns subset accuracy * (for equal sets of labels) */ + @Since("1.2.0") lazy val subsetAccuracy: Double = predictionAndLabels.filter { case (predictions, labels) => predictions.deep == labels.deep }.count().toDouble / numDocs @@ -53,6 +54,7 @@ class MultilabelMetrics(predictionAndLabels: RDD[(Array[Double], Array[Double])] /** * Returns accuracy */ + @Since("1.2.0") lazy val accuracy: Double = predictionAndLabels.map { case (predictions, labels) => labels.intersect(predictions).size.toDouble / (labels.size + predictions.size - labels.intersect(predictions).size)}.sum / numDocs @@ -61,6 +63,7 @@ class MultilabelMetrics(predictionAndLabels: RDD[(Array[Double], Array[Double])] /** * Returns Hamming-loss */ + @Since("1.2.0") lazy val hammingLoss: Double = predictionAndLabels.map { case (predictions, labels) => labels.size + predictions.size - 2 * labels.intersect(predictions).size }.sum / (numDocs * numLabels) @@ -68,6 +71,7 @@ class MultilabelMetrics(predictionAndLabels: RDD[(Array[Double], Array[Double])] /** * Returns document-based precision averaged by the number of documents */ + @Since("1.2.0") lazy val precision: Double = predictionAndLabels.map { case (predictions, labels) => if (predictions.size > 0) { predictions.intersect(labels).size.toDouble / predictions.size @@ -79,6 +83,7 @@ class MultilabelMetrics(predictionAndLabels: RDD[(Array[Double], Array[Double])] /** * Returns document-based recall averaged by the number of documents */ + @Since("1.2.0") lazy val recall: Double = predictionAndLabels.map { case (predictions, labels) => labels.intersect(predictions).size.toDouble / labels.size }.sum / numDocs @@ -86,6 +91,7 @@ class MultilabelMetrics(predictionAndLabels: RDD[(Array[Double], Array[Double])] /** * Returns document-based f1-measure averaged by the number of documents */ + @Since("1.2.0") lazy val f1Measure: Double = predictionAndLabels.map { case (predictions, labels) => 2.0 * predictions.intersect(labels).size / (predictions.size + labels.size) }.sum / numDocs @@ -143,6 +149,7 @@ class MultilabelMetrics(predictionAndLabels: RDD[(Array[Double], Array[Double])] * Returns micro-averaged label-based precision * (equals to micro-averaged document-based precision) */ + @Since("1.2.0") lazy val microPrecision: Double = { val sumFp = fpPerClass.foldLeft(0L){ case(cum, (_, fp)) => cum + fp} sumTp.toDouble / (sumTp + sumFp) @@ -152,6 +159,7 @@ class MultilabelMetrics(predictionAndLabels: RDD[(Array[Double], Array[Double])] * Returns micro-averaged label-based recall * (equals to micro-averaged document-based recall) */ + @Since("1.2.0") lazy val microRecall: Double = { val sumFn = fnPerClass.foldLeft(0.0){ case(cum, (_, fn)) => cum + fn} sumTp.toDouble / (sumTp + sumFn) @@ -161,10 +169,12 @@ class MultilabelMetrics(predictionAndLabels: RDD[(Array[Double], Array[Double])] * Returns micro-averaged label-based f1-measure * (equals to micro-averaged document-based f1-measure) */ + @Since("1.2.0") lazy val microF1Measure: Double = 2.0 * sumTp / (2 * sumTp + sumFnClass + sumFpClass) /** * Returns the sequence of labels in ascending order */ + @Since("1.2.0") lazy val labels: Array[Double] = tpPerClass.keys.toArray.sorted } diff --git a/mllib/src/main/scala/org/apache/spark/mllib/evaluation/RegressionMetrics.scala b/mllib/src/main/scala/org/apache/spark/mllib/evaluation/RegressionMetrics.scala index 36a6c357c3897..799ebb980ef01 100644 --- 
a/mllib/src/main/scala/org/apache/spark/mllib/evaluation/RegressionMetrics.scala +++ b/mllib/src/main/scala/org/apache/spark/mllib/evaluation/RegressionMetrics.scala @@ -32,7 +32,8 @@ import org.apache.spark.sql.DataFrame */ @Since("1.2.0") @Experimental -class RegressionMetrics(predictionAndObservations: RDD[(Double, Double)]) extends Logging { +class RegressionMetrics @Since("1.2.0") ( + predictionAndObservations: RDD[(Double, Double)]) extends Logging { /** * An auxiliary constructor taking a DataFrame. From ab431f8a970b85fba34ccb506c0f8815e55c63bf Mon Sep 17 00:00:00 2001 From: Xiangrui Meng Date: Tue, 25 Aug 2015 20:07:56 -0700 Subject: [PATCH 085/802] [SPARK-10238] [MLLIB] update since versions in mllib.linalg Same as #8421 but for `mllib.linalg`. cc dbtsai Author: Xiangrui Meng Closes #8440 from mengxr/SPARK-10238 and squashes the following commits: b38437e [Xiangrui Meng] update since versions in mllib.linalg --- .../apache/spark/mllib/linalg/Matrices.scala | 44 ++++++++++++------- .../linalg/SingularValueDecomposition.scala | 1 + .../apache/spark/mllib/linalg/Vectors.scala | 25 ++++++++--- .../linalg/distributed/BlockMatrix.scala | 10 +++-- .../linalg/distributed/CoordinateMatrix.scala | 4 +- .../distributed/DistributedMatrix.scala | 2 + .../linalg/distributed/IndexedRowMatrix.scala | 4 +- .../mllib/linalg/distributed/RowMatrix.scala | 5 ++- 8 files changed, 64 insertions(+), 31 deletions(-) diff --git a/mllib/src/main/scala/org/apache/spark/mllib/linalg/Matrices.scala b/mllib/src/main/scala/org/apache/spark/mllib/linalg/Matrices.scala index 28b5b4637bf17..c02ba426fcc3a 100644 --- a/mllib/src/main/scala/org/apache/spark/mllib/linalg/Matrices.scala +++ b/mllib/src/main/scala/org/apache/spark/mllib/linalg/Matrices.scala @@ -32,18 +32,23 @@ import org.apache.spark.sql.types._ * Trait for a local matrix. */ @SQLUserDefinedType(udt = classOf[MatrixUDT]) +@Since("1.0.0") sealed trait Matrix extends Serializable { /** Number of rows. */ + @Since("1.0.0") def numRows: Int /** Number of columns. */ + @Since("1.0.0") def numCols: Int /** Flag that keeps track whether the matrix is transposed or not. False by default. */ + @Since("1.3.0") val isTransposed: Boolean = false /** Converts to a dense array in column major. */ + @Since("1.0.0") def toArray: Array[Double] = { val newArray = new Array[Double](numRows * numCols) foreachActive { (i, j, v) => @@ -56,6 +61,7 @@ sealed trait Matrix extends Serializable { private[mllib] def toBreeze: BM[Double] /** Gets the (i, j)-th element. */ + @Since("1.3.0") def apply(i: Int, j: Int): Double /** Return the index for the (i, j)-th element in the backing array. */ @@ -65,12 +71,15 @@ sealed trait Matrix extends Serializable { private[mllib] def update(i: Int, j: Int, v: Double): Unit /** Get a deep copy of the matrix. */ + @Since("1.2.0") def copy: Matrix /** Transpose the Matrix. Returns a new `Matrix` instance sharing the same underlying data. */ + @Since("1.3.0") def transpose: Matrix /** Convenience method for `Matrix`-`DenseMatrix` multiplication. */ + @Since("1.2.0") def multiply(y: DenseMatrix): DenseMatrix = { val C: DenseMatrix = DenseMatrix.zeros(numRows, y.numCols) BLAS.gemm(1.0, this, y, 0.0, C) @@ -78,11 +87,13 @@ sealed trait Matrix extends Serializable { } /** Convenience method for `Matrix`-`DenseVector` multiplication. For binary compatibility. */ + @Since("1.2.0") def multiply(y: DenseVector): DenseVector = { multiply(y.asInstanceOf[Vector]) } /** Convenience method for `Matrix`-`Vector` multiplication. 
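As a quick illustration of the local Matrix operations tagged above, a sketch with arbitrary values:

    import org.apache.spark.mllib.linalg.{DenseVector, Matrices}

    // 2x2 identity, entries given in column-major order
    val m = Matrices.dense(2, 2, Array(1.0, 0.0, 0.0, 1.0))
    val v = new DenseVector(Array(3.0, 4.0))
    val mv = m.multiply(v)  // equals v here, since m is the identity
    val mt = m.transpose    // new Matrix sharing the same underlying data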
*/ + @Since("1.4.0") def multiply(y: Vector): DenseVector = { val output = new DenseVector(new Array[Double](numRows)) BLAS.gemv(1.0, this, y, 0.0, output) @@ -93,6 +104,7 @@ sealed trait Matrix extends Serializable { override def toString: String = toBreeze.toString() /** A human readable representation of the matrix with maximum lines and width */ + @Since("1.4.0") def toString(maxLines: Int, maxLineWidth: Int): String = toBreeze.toString(maxLines, maxLineWidth) /** Map the values of this matrix using a function. Generates a new matrix. Performs the @@ -118,11 +130,13 @@ sealed trait Matrix extends Serializable { /** * Find the number of non-zero active values. */ + @Since("1.5.0") def numNonzeros: Int /** * Find the number of values stored explicitly. These values can be zero as well. */ + @Since("1.5.0") def numActives: Int } @@ -230,11 +244,11 @@ private[spark] class MatrixUDT extends UserDefinedType[Matrix] { */ @Since("1.0.0") @SQLUserDefinedType(udt = classOf[MatrixUDT]) -class DenseMatrix( - val numRows: Int, - val numCols: Int, - val values: Array[Double], - override val isTransposed: Boolean) extends Matrix { +class DenseMatrix @Since("1.3.0") ( + @Since("1.0.0") val numRows: Int, + @Since("1.0.0") val numCols: Int, + @Since("1.0.0") val values: Array[Double], + @Since("1.3.0") override val isTransposed: Boolean) extends Matrix { require(values.length == numRows * numCols, "The number of values supplied doesn't match the " + s"size of the matrix! values.length: ${values.length}, numRows * numCols: ${numRows * numCols}") @@ -254,7 +268,7 @@ class DenseMatrix( * @param numCols number of columns * @param values matrix entries in column major */ - @Since("1.3.0") + @Since("1.0.0") def this(numRows: Int, numCols: Int, values: Array[Double]) = this(numRows, numCols, values, false) @@ -491,13 +505,13 @@ object DenseMatrix { */ @Since("1.2.0") @SQLUserDefinedType(udt = classOf[MatrixUDT]) -class SparseMatrix( - val numRows: Int, - val numCols: Int, - val colPtrs: Array[Int], - val rowIndices: Array[Int], - val values: Array[Double], - override val isTransposed: Boolean) extends Matrix { +class SparseMatrix @Since("1.3.0") ( + @Since("1.2.0") val numRows: Int, + @Since("1.2.0") val numCols: Int, + @Since("1.2.0") val colPtrs: Array[Int], + @Since("1.2.0") val rowIndices: Array[Int], + @Since("1.2.0") val values: Array[Double], + @Since("1.3.0") override val isTransposed: Boolean) extends Matrix { require(values.length == rowIndices.length, "The number of row indices and values don't match! " + s"values.length: ${values.length}, rowIndices.length: ${rowIndices.length}") @@ -527,7 +541,7 @@ class SparseMatrix( * order for each column * @param values non-zero matrix entries in column major */ - @Since("1.3.0") + @Since("1.2.0") def this( numRows: Int, numCols: Int, @@ -549,8 +563,6 @@ class SparseMatrix( } } - /** - */ @Since("1.3.0") override def apply(i: Int, j: Int): Double = { val ind = index(i, j) diff --git a/mllib/src/main/scala/org/apache/spark/mllib/linalg/SingularValueDecomposition.scala b/mllib/src/main/scala/org/apache/spark/mllib/linalg/SingularValueDecomposition.scala index a37aca99d5e72..4dcf8f28f2023 100644 --- a/mllib/src/main/scala/org/apache/spark/mllib/linalg/SingularValueDecomposition.scala +++ b/mllib/src/main/scala/org/apache/spark/mllib/linalg/SingularValueDecomposition.scala @@ -31,6 +31,7 @@ case class SingularValueDecomposition[UType, VType](U: UType, s: Vector, V: VTyp * :: Experimental :: * Represents QR factors. 
*/ +@Since("1.5.0") @Experimental case class QRDecomposition[QType, RType](Q: QType, R: RType) diff --git a/mllib/src/main/scala/org/apache/spark/mllib/linalg/Vectors.scala b/mllib/src/main/scala/org/apache/spark/mllib/linalg/Vectors.scala index 3d577edbe23e1..06ebb15869909 100644 --- a/mllib/src/main/scala/org/apache/spark/mllib/linalg/Vectors.scala +++ b/mllib/src/main/scala/org/apache/spark/mllib/linalg/Vectors.scala @@ -38,16 +38,19 @@ import org.apache.spark.sql.types._ * Note: Users should not implement this interface. */ @SQLUserDefinedType(udt = classOf[VectorUDT]) +@Since("1.0.0") sealed trait Vector extends Serializable { /** * Size of the vector. */ + @Since("1.0.0") def size: Int /** * Converts the instance to a double array. */ + @Since("1.0.0") def toArray: Array[Double] override def equals(other: Any): Boolean = { @@ -99,11 +102,13 @@ sealed trait Vector extends Serializable { * Gets the value of the ith element. * @param i index */ + @Since("1.1.0") def apply(i: Int): Double = toBreeze(i) /** * Makes a deep copy of this vector. */ + @Since("1.1.0") def copy: Vector = { throw new NotImplementedError(s"copy is not implemented for ${this.getClass}.") } @@ -121,26 +126,31 @@ sealed trait Vector extends Serializable { * Number of active entries. An "active entry" is an element which is explicitly stored, * regardless of its value. Note that inactive entries have value 0. */ + @Since("1.4.0") def numActives: Int /** * Number of nonzero elements. This scans all active values and count nonzeros. */ + @Since("1.4.0") def numNonzeros: Int /** * Converts this vector to a sparse vector with all explicit zeros removed. */ + @Since("1.4.0") def toSparse: SparseVector /** * Converts this vector to a dense vector. */ + @Since("1.4.0") def toDense: DenseVector = new DenseVector(this.toArray) /** * Returns a vector in either dense or sparse format, whichever uses less storage. */ + @Since("1.4.0") def compressed: Vector = { val nnz = numNonzeros // A dense vector needs 8 * size + 8 bytes, while a sparse vector needs 12 * nnz + 20 bytes. @@ -155,6 +165,7 @@ sealed trait Vector extends Serializable { * Find the index of a maximal element. Returns the first maximal element in case of a tie. * Returns -1 if vector has length 0. */ + @Since("1.5.0") def argmax: Int } @@ -532,7 +543,8 @@ object Vectors { */ @Since("1.0.0") @SQLUserDefinedType(udt = classOf[VectorUDT]) -class DenseVector(val values: Array[Double]) extends Vector { +class DenseVector @Since("1.0.0") ( + @Since("1.0.0") val values: Array[Double]) extends Vector { @Since("1.0.0") override def size: Int = values.length @@ -632,7 +644,9 @@ class DenseVector(val values: Array[Double]) extends Vector { @Since("1.3.0") object DenseVector { + /** Extracts the value array from a dense vector. */ + @Since("1.3.0") def unapply(dv: DenseVector): Option[Array[Double]] = Some(dv.values) } @@ -645,10 +659,10 @@ object DenseVector { */ @Since("1.0.0") @SQLUserDefinedType(udt = classOf[VectorUDT]) -class SparseVector( - override val size: Int, - val indices: Array[Int], - val values: Array[Double]) extends Vector { +class SparseVector @Since("1.0.0") ( + @Since("1.0.0") override val size: Int, + @Since("1.0.0") val indices: Array[Int], + @Since("1.0.0") val values: Array[Double]) extends Vector { require(indices.length == values.length, "Sparse vectors require that the dimension of the" + s" indices match the dimension of the values. 
You provided ${indices.length} indices and " + @@ -819,6 +833,7 @@ class SparseVector( @Since("1.3.0") object SparseVector { + @Since("1.3.0") def unapply(sv: SparseVector): Option[(Int, Array[Int], Array[Double])] = Some((sv.size, sv.indices, sv.values)) } diff --git a/mllib/src/main/scala/org/apache/spark/mllib/linalg/distributed/BlockMatrix.scala b/mllib/src/main/scala/org/apache/spark/mllib/linalg/distributed/BlockMatrix.scala index 94376c24a7ac6..a33b6137cf9cc 100644 --- a/mllib/src/main/scala/org/apache/spark/mllib/linalg/distributed/BlockMatrix.scala +++ b/mllib/src/main/scala/org/apache/spark/mllib/linalg/distributed/BlockMatrix.scala @@ -131,10 +131,10 @@ private[mllib] object GridPartitioner { */ @Since("1.3.0") @Experimental -class BlockMatrix( - val blocks: RDD[((Int, Int), Matrix)], - val rowsPerBlock: Int, - val colsPerBlock: Int, +class BlockMatrix @Since("1.3.0") ( + @Since("1.3.0") val blocks: RDD[((Int, Int), Matrix)], + @Since("1.3.0") val rowsPerBlock: Int, + @Since("1.3.0") val colsPerBlock: Int, private var nRows: Long, private var nCols: Long) extends DistributedMatrix with Logging { @@ -171,7 +171,9 @@ class BlockMatrix( nCols } + @Since("1.3.0") val numRowBlocks = math.ceil(numRows() * 1.0 / rowsPerBlock).toInt + @Since("1.3.0") val numColBlocks = math.ceil(numCols() * 1.0 / colsPerBlock).toInt private[mllib] def createPartitioner(): GridPartitioner = diff --git a/mllib/src/main/scala/org/apache/spark/mllib/linalg/distributed/CoordinateMatrix.scala b/mllib/src/main/scala/org/apache/spark/mllib/linalg/distributed/CoordinateMatrix.scala index 4bb27ec840902..644f293d88a75 100644 --- a/mllib/src/main/scala/org/apache/spark/mllib/linalg/distributed/CoordinateMatrix.scala +++ b/mllib/src/main/scala/org/apache/spark/mllib/linalg/distributed/CoordinateMatrix.scala @@ -46,8 +46,8 @@ case class MatrixEntry(i: Long, j: Long, value: Double) */ @Since("1.0.0") @Experimental -class CoordinateMatrix( - val entries: RDD[MatrixEntry], +class CoordinateMatrix @Since("1.0.0") ( + @Since("1.0.0") val entries: RDD[MatrixEntry], private var nRows: Long, private var nCols: Long) extends DistributedMatrix { diff --git a/mllib/src/main/scala/org/apache/spark/mllib/linalg/distributed/DistributedMatrix.scala b/mllib/src/main/scala/org/apache/spark/mllib/linalg/distributed/DistributedMatrix.scala index e51327ebb7b58..db3433a5e2456 100644 --- a/mllib/src/main/scala/org/apache/spark/mllib/linalg/distributed/DistributedMatrix.scala +++ b/mllib/src/main/scala/org/apache/spark/mllib/linalg/distributed/DistributedMatrix.scala @@ -28,9 +28,11 @@ import org.apache.spark.annotation.Since trait DistributedMatrix extends Serializable { /** Gets or computes the number of rows. */ + @Since("1.0.0") def numRows(): Long /** Gets or computes the number of columns. */ + @Since("1.0.0") def numCols(): Long /** Collects data and assembles a local dense breeze matrix (for test only). 
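A short sketch of the Vector factory and conversion methods tagged in this patch (sizes and values are arbitrary):

    import org.apache.spark.mllib.linalg.{Vector, Vectors}

    val dv: Vector = Vectors.dense(1.0, 0.0, 3.0)
    val sv: Vector = Vectors.sparse(3, Array(0, 2), Array(1.0, 3.0))
    println(dv.numNonzeros)  // 2
    println(dv.compressed)   // whichever of dense/sparse uses less storage
    println(sv.toDense)      // explicit dense copy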
*/ diff --git a/mllib/src/main/scala/org/apache/spark/mllib/linalg/distributed/IndexedRowMatrix.scala b/mllib/src/main/scala/org/apache/spark/mllib/linalg/distributed/IndexedRowMatrix.scala index 6d2c05a47d049..b20ea0dc50da5 100644 --- a/mllib/src/main/scala/org/apache/spark/mllib/linalg/distributed/IndexedRowMatrix.scala +++ b/mllib/src/main/scala/org/apache/spark/mllib/linalg/distributed/IndexedRowMatrix.scala @@ -45,8 +45,8 @@ case class IndexedRow(index: Long, vector: Vector) */ @Since("1.0.0") @Experimental -class IndexedRowMatrix( - val rows: RDD[IndexedRow], +class IndexedRowMatrix @Since("1.0.0") ( + @Since("1.0.0") val rows: RDD[IndexedRow], private var nRows: Long, private var nCols: Int) extends DistributedMatrix { diff --git a/mllib/src/main/scala/org/apache/spark/mllib/linalg/distributed/RowMatrix.scala b/mllib/src/main/scala/org/apache/spark/mllib/linalg/distributed/RowMatrix.scala index 78036eba5c3e6..9a423ddafdc09 100644 --- a/mllib/src/main/scala/org/apache/spark/mllib/linalg/distributed/RowMatrix.scala +++ b/mllib/src/main/scala/org/apache/spark/mllib/linalg/distributed/RowMatrix.scala @@ -47,8 +47,8 @@ import org.apache.spark.storage.StorageLevel */ @Since("1.0.0") @Experimental -class RowMatrix( - val rows: RDD[Vector], +class RowMatrix @Since("1.0.0") ( + @Since("1.0.0") val rows: RDD[Vector], private var nRows: Long, private var nCols: Int) extends DistributedMatrix with Logging { @@ -519,6 +519,7 @@ class RowMatrix( * @param computeQ whether to computeQ * @return QRDecomposition(Q, R), Q = null if computeQ = false. */ + @Since("1.5.0") def tallSkinnyQR(computeQ: Boolean = false): QRDecomposition[RowMatrix, Matrix] = { val col = numCols().toInt // split rows horizontally into smaller matrices, and compute QR for each of them From c3a54843c0c8a14059da4e6716c1ad45c69bbe6c Mon Sep 17 00:00:00 2001 From: Xiangrui Meng Date: Tue, 25 Aug 2015 22:31:23 -0700 Subject: [PATCH 086/802] [SPARK-10240] [SPARK-10242] [MLLIB] update since versions in mlilb.random and mllib.stat The same as #8241 but for `mllib.stat` and `mllib.random`. cc feynmanliang Author: Xiangrui Meng Closes #8439 from mengxr/SPARK-10242. --- .../mllib/random/RandomDataGenerator.scala | 43 ++++++++++-- .../spark/mllib/random/RandomRDDs.scala | 69 ++++++++++++++++--- .../distribution/MultivariateGaussian.scala | 6 +- .../spark/mllib/stat/test/TestResult.scala | 24 ++++--- 4 files changed, 117 insertions(+), 25 deletions(-) diff --git a/mllib/src/main/scala/org/apache/spark/mllib/random/RandomDataGenerator.scala b/mllib/src/main/scala/org/apache/spark/mllib/random/RandomDataGenerator.scala index 9349ecaa13f56..a2d85a68cd327 100644 --- a/mllib/src/main/scala/org/apache/spark/mllib/random/RandomDataGenerator.scala +++ b/mllib/src/main/scala/org/apache/spark/mllib/random/RandomDataGenerator.scala @@ -20,7 +20,7 @@ package org.apache.spark.mllib.random import org.apache.commons.math3.distribution.{ExponentialDistribution, GammaDistribution, LogNormalDistribution, PoissonDistribution} -import org.apache.spark.annotation.DeveloperApi +import org.apache.spark.annotation.{Since, DeveloperApi} import org.apache.spark.util.random.{XORShiftRandom, Pseudorandom} /** @@ -28,17 +28,20 @@ import org.apache.spark.util.random.{XORShiftRandom, Pseudorandom} * Trait for random data generators that generate i.i.d. data. */ @DeveloperApi +@Since("1.1.0") trait RandomDataGenerator[T] extends Pseudorandom with Serializable { /** * Returns an i.i.d. sample as a generic type from an underlying distribution. 
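A sketch of the distributed RowMatrix API whose constructor and tallSkinnyQR method are tagged above (toy rows; assumes an active SparkContext `sc`):

    import org.apache.spark.mllib.linalg.Vectors
    import org.apache.spark.mllib.linalg.distributed.RowMatrix

    val rows = sc.parallelize(Seq(
      Vectors.dense(1.0, 2.0), Vectors.dense(3.0, 4.0), Vectors.dense(5.0, 6.0)))
    val mat = new RowMatrix(rows)
    val qr = mat.tallSkinnyQR(computeQ = true)  // QRDecomposition(Q: RowMatrix, R: Matrix)
    println(qr.R)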
*/ + @Since("1.1.0") def nextValue(): T /** * Returns a copy of the RandomDataGenerator with a new instance of the rng object used in the * class when applicable for non-locking concurrent usage. */ + @Since("1.1.0") def copy(): RandomDataGenerator[T] } @@ -47,17 +50,21 @@ trait RandomDataGenerator[T] extends Pseudorandom with Serializable { * Generates i.i.d. samples from U[0.0, 1.0] */ @DeveloperApi +@Since("1.1.0") class UniformGenerator extends RandomDataGenerator[Double] { // XORShiftRandom for better performance. Thread safety isn't necessary here. private val random = new XORShiftRandom() + @Since("1.1.0") override def nextValue(): Double = { random.nextDouble() } + @Since("1.1.0") override def setSeed(seed: Long): Unit = random.setSeed(seed) + @Since("1.1.0") override def copy(): UniformGenerator = new UniformGenerator() } @@ -66,17 +73,21 @@ class UniformGenerator extends RandomDataGenerator[Double] { * Generates i.i.d. samples from the standard normal distribution. */ @DeveloperApi +@Since("1.1.0") class StandardNormalGenerator extends RandomDataGenerator[Double] { // XORShiftRandom for better performance. Thread safety isn't necessary here. private val random = new XORShiftRandom() + @Since("1.1.0") override def nextValue(): Double = { random.nextGaussian() } + @Since("1.1.0") override def setSeed(seed: Long): Unit = random.setSeed(seed) + @Since("1.1.0") override def copy(): StandardNormalGenerator = new StandardNormalGenerator() } @@ -87,16 +98,21 @@ class StandardNormalGenerator extends RandomDataGenerator[Double] { * @param mean mean for the Poisson distribution. */ @DeveloperApi -class PoissonGenerator(val mean: Double) extends RandomDataGenerator[Double] { +@Since("1.1.0") +class PoissonGenerator @Since("1.1.0") ( + @Since("1.1.0") val mean: Double) extends RandomDataGenerator[Double] { private val rng = new PoissonDistribution(mean) + @Since("1.1.0") override def nextValue(): Double = rng.sample() + @Since("1.1.0") override def setSeed(seed: Long) { rng.reseedRandomGenerator(seed) } + @Since("1.1.0") override def copy(): PoissonGenerator = new PoissonGenerator(mean) } @@ -107,16 +123,21 @@ class PoissonGenerator(val mean: Double) extends RandomDataGenerator[Double] { * @param mean mean for the exponential distribution. 
*/ @DeveloperApi -class ExponentialGenerator(val mean: Double) extends RandomDataGenerator[Double] { +@Since("1.3.0") +class ExponentialGenerator @Since("1.3.0") ( + @Since("1.3.0") val mean: Double) extends RandomDataGenerator[Double] { private val rng = new ExponentialDistribution(mean) + @Since("1.3.0") override def nextValue(): Double = rng.sample() + @Since("1.3.0") override def setSeed(seed: Long) { rng.reseedRandomGenerator(seed) } + @Since("1.3.0") override def copy(): ExponentialGenerator = new ExponentialGenerator(mean) } @@ -128,16 +149,22 @@ class ExponentialGenerator(val mean: Double) extends RandomDataGenerator[Double] * @param scale scale for the gamma distribution */ @DeveloperApi -class GammaGenerator(val shape: Double, val scale: Double) extends RandomDataGenerator[Double] { +@Since("1.3.0") +class GammaGenerator @Since("1.3.0") ( + @Since("1.3.0") val shape: Double, + @Since("1.3.0") val scale: Double) extends RandomDataGenerator[Double] { private val rng = new GammaDistribution(shape, scale) + @Since("1.3.0") override def nextValue(): Double = rng.sample() + @Since("1.3.0") override def setSeed(seed: Long) { rng.reseedRandomGenerator(seed) } + @Since("1.3.0") override def copy(): GammaGenerator = new GammaGenerator(shape, scale) } @@ -150,15 +177,21 @@ class GammaGenerator(val shape: Double, val scale: Double) extends RandomDataGen * @param std standard deviation for the log normal distribution */ @DeveloperApi -class LogNormalGenerator(val mean: Double, val std: Double) extends RandomDataGenerator[Double] { +@Since("1.3.0") +class LogNormalGenerator @Since("1.3.0") ( + @Since("1.3.0") val mean: Double, + @Since("1.3.0") val std: Double) extends RandomDataGenerator[Double] { private val rng = new LogNormalDistribution(mean, std) + @Since("1.3.0") override def nextValue(): Double = rng.sample() + @Since("1.3.0") override def setSeed(seed: Long) { rng.reseedRandomGenerator(seed) } + @Since("1.3.0") override def copy(): LogNormalGenerator = new LogNormalGenerator(mean, std) } diff --git a/mllib/src/main/scala/org/apache/spark/mllib/random/RandomRDDs.scala b/mllib/src/main/scala/org/apache/spark/mllib/random/RandomRDDs.scala index 174d5e0f6c9f0..4dd5ea214d678 100644 --- a/mllib/src/main/scala/org/apache/spark/mllib/random/RandomRDDs.scala +++ b/mllib/src/main/scala/org/apache/spark/mllib/random/RandomRDDs.scala @@ -20,7 +20,7 @@ package org.apache.spark.mllib.random import scala.reflect.ClassTag import org.apache.spark.SparkContext -import org.apache.spark.annotation.{DeveloperApi, Experimental} +import org.apache.spark.annotation.{DeveloperApi, Experimental, Since} import org.apache.spark.api.java.{JavaDoubleRDD, JavaRDD, JavaSparkContext} import org.apache.spark.mllib.linalg.Vector import org.apache.spark.mllib.rdd.{RandomRDD, RandomVectorRDD} @@ -32,6 +32,7 @@ import org.apache.spark.util.Utils * Generator methods for creating RDDs comprised of `i.i.d.` samples from some distribution. */ @Experimental +@Since("1.1.0") object RandomRDDs { /** @@ -46,6 +47,7 @@ object RandomRDDs { * @param seed Random seed (default: a random long integer). * @return RDD[Double] comprised of `i.i.d.` samples ~ `U(0.0, 1.0)`. */ + @Since("1.1.0") def uniformRDD( sc: SparkContext, size: Long, @@ -58,6 +60,7 @@ object RandomRDDs { /** * Java-friendly version of [[RandomRDDs#uniformRDD]]. */ + @Since("1.1.0") def uniformJavaRDD( jsc: JavaSparkContext, size: Long, @@ -69,6 +72,7 @@ object RandomRDDs { /** * [[RandomRDDs#uniformJavaRDD]] with the default seed. 
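For the RandomRDDs factory methods tagged here, a minimal sketch (size, partition count, and seed are arbitrary; assumes an active SparkContext `sc`):

    import org.apache.spark.mllib.random.RandomRDDs

    val u = RandomRDDs.uniformRDD(sc, 1000L, numPartitions = 4, seed = 11L)  // ~ U(0.0, 1.0)
    println(u.mean())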
*/ + @Since("1.1.0") def uniformJavaRDD(jsc: JavaSparkContext, size: Long, numPartitions: Int): JavaDoubleRDD = { JavaDoubleRDD.fromRDD(uniformRDD(jsc.sc, size, numPartitions)) } @@ -76,6 +80,7 @@ object RandomRDDs { /** * [[RandomRDDs#uniformJavaRDD]] with the default number of partitions and the default seed. */ + @Since("1.1.0") def uniformJavaRDD(jsc: JavaSparkContext, size: Long): JavaDoubleRDD = { JavaDoubleRDD.fromRDD(uniformRDD(jsc.sc, size)) } @@ -92,6 +97,7 @@ object RandomRDDs { * @param seed Random seed (default: a random long integer). * @return RDD[Double] comprised of `i.i.d.` samples ~ N(0.0, 1.0). */ + @Since("1.1.0") def normalRDD( sc: SparkContext, size: Long, @@ -104,6 +110,7 @@ object RandomRDDs { /** * Java-friendly version of [[RandomRDDs#normalRDD]]. */ + @Since("1.1.0") def normalJavaRDD( jsc: JavaSparkContext, size: Long, @@ -115,6 +122,7 @@ object RandomRDDs { /** * [[RandomRDDs#normalJavaRDD]] with the default seed. */ + @Since("1.1.0") def normalJavaRDD(jsc: JavaSparkContext, size: Long, numPartitions: Int): JavaDoubleRDD = { JavaDoubleRDD.fromRDD(normalRDD(jsc.sc, size, numPartitions)) } @@ -122,6 +130,7 @@ object RandomRDDs { /** * [[RandomRDDs#normalJavaRDD]] with the default number of partitions and the default seed. */ + @Since("1.1.0") def normalJavaRDD(jsc: JavaSparkContext, size: Long): JavaDoubleRDD = { JavaDoubleRDD.fromRDD(normalRDD(jsc.sc, size)) } @@ -137,6 +146,7 @@ object RandomRDDs { * @param seed Random seed (default: a random long integer). * @return RDD[Double] comprised of `i.i.d.` samples ~ Pois(mean). */ + @Since("1.1.0") def poissonRDD( sc: SparkContext, mean: Double, @@ -150,6 +160,7 @@ object RandomRDDs { /** * Java-friendly version of [[RandomRDDs#poissonRDD]]. */ + @Since("1.1.0") def poissonJavaRDD( jsc: JavaSparkContext, mean: Double, @@ -162,6 +173,7 @@ object RandomRDDs { /** * [[RandomRDDs#poissonJavaRDD]] with the default seed. */ + @Since("1.1.0") def poissonJavaRDD( jsc: JavaSparkContext, mean: Double, @@ -173,6 +185,7 @@ object RandomRDDs { /** * [[RandomRDDs#poissonJavaRDD]] with the default number of partitions and the default seed. */ + @Since("1.1.0") def poissonJavaRDD(jsc: JavaSparkContext, mean: Double, size: Long): JavaDoubleRDD = { JavaDoubleRDD.fromRDD(poissonRDD(jsc.sc, mean, size)) } @@ -188,6 +201,7 @@ object RandomRDDs { * @param seed Random seed (default: a random long integer). * @return RDD[Double] comprised of `i.i.d.` samples ~ Pois(mean). */ + @Since("1.3.0") def exponentialRDD( sc: SparkContext, mean: Double, @@ -201,6 +215,7 @@ object RandomRDDs { /** * Java-friendly version of [[RandomRDDs#exponentialRDD]]. */ + @Since("1.3.0") def exponentialJavaRDD( jsc: JavaSparkContext, mean: Double, @@ -213,6 +228,7 @@ object RandomRDDs { /** * [[RandomRDDs#exponentialJavaRDD]] with the default seed. */ + @Since("1.3.0") def exponentialJavaRDD( jsc: JavaSparkContext, mean: Double, @@ -224,6 +240,7 @@ object RandomRDDs { /** * [[RandomRDDs#exponentialJavaRDD]] with the default number of partitions and the default seed. */ + @Since("1.3.0") def exponentialJavaRDD(jsc: JavaSparkContext, mean: Double, size: Long): JavaDoubleRDD = { JavaDoubleRDD.fromRDD(exponentialRDD(jsc.sc, mean, size)) } @@ -240,6 +257,7 @@ object RandomRDDs { * @param seed Random seed (default: a random long integer). * @return RDD[Double] comprised of `i.i.d.` samples ~ Pois(mean). */ + @Since("1.3.0") def gammaRDD( sc: SparkContext, shape: Double, @@ -254,6 +272,7 @@ object RandomRDDs { /** * Java-friendly version of [[RandomRDDs#gammaRDD]]. 
*/ + @Since("1.3.0") def gammaJavaRDD( jsc: JavaSparkContext, shape: Double, @@ -267,6 +286,7 @@ object RandomRDDs { /** * [[RandomRDDs#gammaJavaRDD]] with the default seed. */ + @Since("1.3.0") def gammaJavaRDD( jsc: JavaSparkContext, shape: Double, @@ -279,11 +299,12 @@ object RandomRDDs { /** * [[RandomRDDs#gammaJavaRDD]] with the default number of partitions and the default seed. */ + @Since("1.3.0") def gammaJavaRDD( - jsc: JavaSparkContext, - shape: Double, - scale: Double, - size: Long): JavaDoubleRDD = { + jsc: JavaSparkContext, + shape: Double, + scale: Double, + size: Long): JavaDoubleRDD = { JavaDoubleRDD.fromRDD(gammaRDD(jsc.sc, shape, scale, size)) } @@ -299,6 +320,7 @@ object RandomRDDs { * @param seed Random seed (default: a random long integer). * @return RDD[Double] comprised of `i.i.d.` samples ~ Pois(mean). */ + @Since("1.3.0") def logNormalRDD( sc: SparkContext, mean: Double, @@ -313,6 +335,7 @@ object RandomRDDs { /** * Java-friendly version of [[RandomRDDs#logNormalRDD]]. */ + @Since("1.3.0") def logNormalJavaRDD( jsc: JavaSparkContext, mean: Double, @@ -326,6 +349,7 @@ object RandomRDDs { /** * [[RandomRDDs#logNormalJavaRDD]] with the default seed. */ + @Since("1.3.0") def logNormalJavaRDD( jsc: JavaSparkContext, mean: Double, @@ -338,11 +362,12 @@ object RandomRDDs { /** * [[RandomRDDs#logNormalJavaRDD]] with the default number of partitions and the default seed. */ + @Since("1.3.0") def logNormalJavaRDD( - jsc: JavaSparkContext, - mean: Double, - std: Double, - size: Long): JavaDoubleRDD = { + jsc: JavaSparkContext, + mean: Double, + std: Double, + size: Long): JavaDoubleRDD = { JavaDoubleRDD.fromRDD(logNormalRDD(jsc.sc, mean, std, size)) } @@ -359,6 +384,7 @@ object RandomRDDs { * @return RDD[Double] comprised of `i.i.d.` samples produced by generator. */ @DeveloperApi + @Since("1.1.0") def randomRDD[T: ClassTag]( sc: SparkContext, generator: RandomDataGenerator[T], @@ -381,6 +407,7 @@ object RandomRDDs { * @param seed Seed for the RNG that generates the seed for the generator in each partition. * @return RDD[Vector] with vectors containing i.i.d samples ~ `U(0.0, 1.0)`. */ + @Since("1.1.0") def uniformVectorRDD( sc: SparkContext, numRows: Long, @@ -394,6 +421,7 @@ object RandomRDDs { /** * Java-friendly version of [[RandomRDDs#uniformVectorRDD]]. */ + @Since("1.1.0") def uniformJavaVectorRDD( jsc: JavaSparkContext, numRows: Long, @@ -406,6 +434,7 @@ object RandomRDDs { /** * [[RandomRDDs#uniformJavaVectorRDD]] with the default seed. */ + @Since("1.1.0") def uniformJavaVectorRDD( jsc: JavaSparkContext, numRows: Long, @@ -417,6 +446,7 @@ object RandomRDDs { /** * [[RandomRDDs#uniformJavaVectorRDD]] with the default number of partitions and the default seed. */ + @Since("1.1.0") def uniformJavaVectorRDD( jsc: JavaSparkContext, numRows: Long, @@ -435,6 +465,7 @@ object RandomRDDs { * @param seed Random seed (default: a random long integer). * @return RDD[Vector] with vectors containing `i.i.d.` samples ~ `N(0.0, 1.0)`. */ + @Since("1.1.0") def normalVectorRDD( sc: SparkContext, numRows: Long, @@ -448,6 +479,7 @@ object RandomRDDs { /** * Java-friendly version of [[RandomRDDs#normalVectorRDD]]. */ + @Since("1.1.0") def normalJavaVectorRDD( jsc: JavaSparkContext, numRows: Long, @@ -460,6 +492,7 @@ object RandomRDDs { /** * [[RandomRDDs#normalJavaVectorRDD]] with the default seed. 
*/ + @Since("1.1.0") def normalJavaVectorRDD( jsc: JavaSparkContext, numRows: Long, @@ -471,6 +504,7 @@ object RandomRDDs { /** * [[RandomRDDs#normalJavaVectorRDD]] with the default number of partitions and the default seed. */ + @Since("1.1.0") def normalJavaVectorRDD( jsc: JavaSparkContext, numRows: Long, @@ -491,6 +525,7 @@ object RandomRDDs { * @param seed Random seed (default: a random long integer). * @return RDD[Vector] with vectors containing `i.i.d.` samples. */ + @Since("1.3.0") def logNormalVectorRDD( sc: SparkContext, mean: Double, @@ -507,6 +542,7 @@ object RandomRDDs { /** * Java-friendly version of [[RandomRDDs#logNormalVectorRDD]]. */ + @Since("1.3.0") def logNormalJavaVectorRDD( jsc: JavaSparkContext, mean: Double, @@ -521,6 +557,7 @@ object RandomRDDs { /** * [[RandomRDDs#logNormalJavaVectorRDD]] with the default seed. */ + @Since("1.3.0") def logNormalJavaVectorRDD( jsc: JavaSparkContext, mean: Double, @@ -535,6 +572,7 @@ object RandomRDDs { * [[RandomRDDs#logNormalJavaVectorRDD]] with the default number of partitions and * the default seed. */ + @Since("1.3.0") def logNormalJavaVectorRDD( jsc: JavaSparkContext, mean: Double, @@ -556,6 +594,7 @@ object RandomRDDs { * @param seed Random seed (default: a random long integer). * @return RDD[Vector] with vectors containing `i.i.d.` samples ~ Pois(mean). */ + @Since("1.1.0") def poissonVectorRDD( sc: SparkContext, mean: Double, @@ -570,6 +609,7 @@ object RandomRDDs { /** * Java-friendly version of [[RandomRDDs#poissonVectorRDD]]. */ + @Since("1.1.0") def poissonJavaVectorRDD( jsc: JavaSparkContext, mean: Double, @@ -583,6 +623,7 @@ object RandomRDDs { /** * [[RandomRDDs#poissonJavaVectorRDD]] with the default seed. */ + @Since("1.1.0") def poissonJavaVectorRDD( jsc: JavaSparkContext, mean: Double, @@ -595,6 +636,7 @@ object RandomRDDs { /** * [[RandomRDDs#poissonJavaVectorRDD]] with the default number of partitions and the default seed. */ + @Since("1.1.0") def poissonJavaVectorRDD( jsc: JavaSparkContext, mean: Double, @@ -615,6 +657,7 @@ object RandomRDDs { * @param seed Random seed (default: a random long integer). * @return RDD[Vector] with vectors containing `i.i.d.` samples ~ Exp(mean). */ + @Since("1.3.0") def exponentialVectorRDD( sc: SparkContext, mean: Double, @@ -630,6 +673,7 @@ object RandomRDDs { /** * Java-friendly version of [[RandomRDDs#exponentialVectorRDD]]. */ + @Since("1.3.0") def exponentialJavaVectorRDD( jsc: JavaSparkContext, mean: Double, @@ -643,6 +687,7 @@ object RandomRDDs { /** * [[RandomRDDs#exponentialJavaVectorRDD]] with the default seed. */ + @Since("1.3.0") def exponentialJavaVectorRDD( jsc: JavaSparkContext, mean: Double, @@ -656,6 +701,7 @@ object RandomRDDs { * [[RandomRDDs#exponentialJavaVectorRDD]] with the default number of partitions * and the default seed. */ + @Since("1.3.0") def exponentialJavaVectorRDD( jsc: JavaSparkContext, mean: Double, @@ -678,6 +724,7 @@ object RandomRDDs { * @param seed Random seed (default: a random long integer). * @return RDD[Vector] with vectors containing `i.i.d.` samples ~ Exp(mean). */ + @Since("1.3.0") def gammaVectorRDD( sc: SparkContext, shape: Double, @@ -693,6 +740,7 @@ object RandomRDDs { /** * Java-friendly version of [[RandomRDDs#gammaVectorRDD]]. */ + @Since("1.3.0") def gammaJavaVectorRDD( jsc: JavaSparkContext, shape: Double, @@ -707,6 +755,7 @@ object RandomRDDs { /** * [[RandomRDDs#gammaJavaVectorRDD]] with the default seed. 
*/ + @Since("1.3.0") def gammaJavaVectorRDD( jsc: JavaSparkContext, shape: Double, @@ -720,6 +769,7 @@ object RandomRDDs { /** * [[RandomRDDs#gammaJavaVectorRDD]] with the default number of partitions and the default seed. */ + @Since("1.3.0") def gammaJavaVectorRDD( jsc: JavaSparkContext, shape: Double, @@ -744,6 +794,7 @@ object RandomRDDs { * @return RDD[Vector] with vectors containing `i.i.d.` samples produced by generator. */ @DeveloperApi + @Since("1.1.0") def randomVectorRDD(sc: SparkContext, generator: RandomDataGenerator[Double], numRows: Long, diff --git a/mllib/src/main/scala/org/apache/spark/mllib/stat/distribution/MultivariateGaussian.scala b/mllib/src/main/scala/org/apache/spark/mllib/stat/distribution/MultivariateGaussian.scala index bd4d81390bfae..92a5af708d04b 100644 --- a/mllib/src/main/scala/org/apache/spark/mllib/stat/distribution/MultivariateGaussian.scala +++ b/mllib/src/main/scala/org/apache/spark/mllib/stat/distribution/MultivariateGaussian.scala @@ -35,9 +35,9 @@ import org.apache.spark.mllib.util.MLUtils */ @Since("1.3.0") @DeveloperApi -class MultivariateGaussian ( - val mu: Vector, - val sigma: Matrix) extends Serializable { +class MultivariateGaussian @Since("1.3.0") ( + @Since("1.3.0") val mu: Vector, + @Since("1.3.0") val sigma: Matrix) extends Serializable { require(sigma.numCols == sigma.numRows, "Covariance matrix must be square") require(mu.size == sigma.numCols, "Mean vector length must match covariance matrix size") diff --git a/mllib/src/main/scala/org/apache/spark/mllib/stat/test/TestResult.scala b/mllib/src/main/scala/org/apache/spark/mllib/stat/test/TestResult.scala index f44be13706695..d01b3707be944 100644 --- a/mllib/src/main/scala/org/apache/spark/mllib/stat/test/TestResult.scala +++ b/mllib/src/main/scala/org/apache/spark/mllib/stat/test/TestResult.scala @@ -17,7 +17,7 @@ package org.apache.spark.mllib.stat.test -import org.apache.spark.annotation.Experimental +import org.apache.spark.annotation.{Experimental, Since} /** * :: Experimental :: @@ -25,28 +25,33 @@ import org.apache.spark.annotation.Experimental * @tparam DF Return type of `degreesOfFreedom`. */ @Experimental +@Since("1.1.0") trait TestResult[DF] { /** * The probability of obtaining a test statistic result at least as extreme as the one that was * actually observed, assuming that the null hypothesis is true. */ + @Since("1.1.0") def pValue: Double /** * Returns the degree(s) of freedom of the hypothesis test. * Return type should be Number(e.g. Int, Double) or tuples of Numbers for toString compatibility. */ + @Since("1.1.0") def degreesOfFreedom: DF /** * Test statistic. */ + @Since("1.1.0") def statistic: Double /** * Null hypothesis of the test. */ + @Since("1.1.0") def nullHypothesis: String /** @@ -78,11 +83,12 @@ trait TestResult[DF] { * Object containing the test results for the chi-squared hypothesis test. 
*/ @Experimental +@Since("1.1.0") class ChiSqTestResult private[stat] (override val pValue: Double, - override val degreesOfFreedom: Int, - override val statistic: Double, - val method: String, - override val nullHypothesis: String) extends TestResult[Int] { + @Since("1.1.0") override val degreesOfFreedom: Int, + @Since("1.1.0") override val statistic: Double, + @Since("1.1.0") val method: String, + @Since("1.1.0") override val nullHypothesis: String) extends TestResult[Int] { override def toString: String = { "Chi squared test summary:\n" + @@ -96,11 +102,13 @@ class ChiSqTestResult private[stat] (override val pValue: Double, * Object containing the test results for the Kolmogorov-Smirnov test. */ @Experimental +@Since("1.5.0") class KolmogorovSmirnovTestResult private[stat] ( - override val pValue: Double, - override val statistic: Double, - override val nullHypothesis: String) extends TestResult[Int] { + @Since("1.5.0") override val pValue: Double, + @Since("1.5.0") override val statistic: Double, + @Since("1.5.0") override val nullHypothesis: String) extends TestResult[Int] { + @Since("1.5.0") override val degreesOfFreedom = 0 override def toString: String = { From d703372f86d6a59383ba8569fcd9d379849cffbf Mon Sep 17 00:00:00 2001 From: Xiangrui Meng Date: Tue, 25 Aug 2015 22:33:48 -0700 Subject: [PATCH 087/802] [SPARK-10234] [MLLIB] update since version in mllib.clustering Same as #8421 but for `mllib.clustering`. cc feynmanliang yu-iskw Author: Xiangrui Meng Closes #8435 from mengxr/SPARK-10234. --- .../mllib/clustering/GaussianMixture.scala | 1 + .../clustering/GaussianMixtureModel.scala | 8 +++--- .../spark/mllib/clustering/KMeans.scala | 1 + .../spark/mllib/clustering/KMeansModel.scala | 4 +-- .../spark/mllib/clustering/LDAModel.scala | 28 ++++++++++++++----- .../clustering/PowerIterationClustering.scala | 10 +++++-- .../mllib/clustering/StreamingKMeans.scala | 15 +++++----- 7 files changed, 44 insertions(+), 23 deletions(-) diff --git a/mllib/src/main/scala/org/apache/spark/mllib/clustering/GaussianMixture.scala b/mllib/src/main/scala/org/apache/spark/mllib/clustering/GaussianMixture.scala index daa947e81d44d..f82bd82c20371 100644 --- a/mllib/src/main/scala/org/apache/spark/mllib/clustering/GaussianMixture.scala +++ b/mllib/src/main/scala/org/apache/spark/mllib/clustering/GaussianMixture.scala @@ -53,6 +53,7 @@ import org.apache.spark.util.Utils * @param maxIterations The maximum number of iterations to perform */ @Experimental +@Since("1.3.0") class GaussianMixture private ( private var k: Int, private var convergenceTol: Double, diff --git a/mllib/src/main/scala/org/apache/spark/mllib/clustering/GaussianMixtureModel.scala b/mllib/src/main/scala/org/apache/spark/mllib/clustering/GaussianMixtureModel.scala index 1a10a8b624218..7f6163e04bf17 100644 --- a/mllib/src/main/scala/org/apache/spark/mllib/clustering/GaussianMixtureModel.scala +++ b/mllib/src/main/scala/org/apache/spark/mllib/clustering/GaussianMixtureModel.scala @@ -46,9 +46,9 @@ import org.apache.spark.sql.{SQLContext, Row} */ @Since("1.3.0") @Experimental -class GaussianMixtureModel( - val weights: Array[Double], - val gaussians: Array[MultivariateGaussian]) extends Serializable with Saveable { +class GaussianMixtureModel @Since("1.3.0") ( + @Since("1.3.0") val weights: Array[Double], + @Since("1.3.0") val gaussians: Array[MultivariateGaussian]) extends Serializable with Saveable { require(weights.length == gaussians.length, "Length of weight and Gaussian arrays must match") @@ -178,7 +178,7 @@ object 
GaussianMixtureModel extends Loader[GaussianMixtureModel] { (weight, new MultivariateGaussian(mu, sigma)) }.unzip - return new GaussianMixtureModel(weights.toArray, gaussians.toArray) + new GaussianMixtureModel(weights.toArray, gaussians.toArray) } } diff --git a/mllib/src/main/scala/org/apache/spark/mllib/clustering/KMeans.scala b/mllib/src/main/scala/org/apache/spark/mllib/clustering/KMeans.scala index 3e9545a74bef3..46920fffe6e1a 100644 --- a/mllib/src/main/scala/org/apache/spark/mllib/clustering/KMeans.scala +++ b/mllib/src/main/scala/org/apache/spark/mllib/clustering/KMeans.scala @@ -37,6 +37,7 @@ import org.apache.spark.util.random.XORShiftRandom * This is an iterative algorithm that will make multiple passes over the data, so any RDDs given * to it should be cached by the user. */ +@Since("0.8.0") class KMeans private ( private var k: Int, private var maxIterations: Int, diff --git a/mllib/src/main/scala/org/apache/spark/mllib/clustering/KMeansModel.scala b/mllib/src/main/scala/org/apache/spark/mllib/clustering/KMeansModel.scala index e425ecdd481c6..a741584982725 100644 --- a/mllib/src/main/scala/org/apache/spark/mllib/clustering/KMeansModel.scala +++ b/mllib/src/main/scala/org/apache/spark/mllib/clustering/KMeansModel.scala @@ -37,8 +37,8 @@ import org.apache.spark.sql.Row * A clustering model for K-means. Each point belongs to the cluster with the closest center. */ @Since("0.8.0") -class KMeansModel ( - val clusterCenters: Array[Vector]) extends Saveable with Serializable with PMMLExportable { +class KMeansModel @Since("1.1.0") (@Since("1.0.0") val clusterCenters: Array[Vector]) + extends Saveable with Serializable with PMMLExportable { /** * A Java-friendly constructor that takes an Iterable of Vectors. diff --git a/mllib/src/main/scala/org/apache/spark/mllib/clustering/LDAModel.scala b/mllib/src/main/scala/org/apache/spark/mllib/clustering/LDAModel.scala index 432bbedc8d6f8..15129e0dd5a91 100644 --- a/mllib/src/main/scala/org/apache/spark/mllib/clustering/LDAModel.scala +++ b/mllib/src/main/scala/org/apache/spark/mllib/clustering/LDAModel.scala @@ -43,12 +43,15 @@ import org.apache.spark.util.BoundedPriorityQueue * including local and distributed data structures. */ @Experimental +@Since("1.3.0") abstract class LDAModel private[clustering] extends Saveable { /** Number of topics */ + @Since("1.3.0") def k: Int /** Vocabulary size (number of terms or terms in the vocabulary) */ + @Since("1.3.0") def vocabSize: Int /** @@ -57,6 +60,7 @@ abstract class LDAModel private[clustering] extends Saveable { * * This is the parameter to a Dirichlet distribution. */ + @Since("1.5.0") def docConcentration: Vector /** @@ -68,6 +72,7 @@ abstract class LDAModel private[clustering] extends Saveable { * Note: The topics' distributions over terms are called "beta" in the original LDA paper * by Blei et al., but are called "phi" in many later papers such as Asuncion et al., 2009. */ + @Since("1.5.0") def topicConcentration: Double /** @@ -81,6 +86,7 @@ abstract class LDAModel private[clustering] extends Saveable { * This is a matrix of size vocabSize x k, where each column is a topic. * No guarantees are given about the ordering of the topics. */ + @Since("1.3.0") def topicsMatrix: Matrix /** @@ -91,6 +97,7 @@ abstract class LDAModel private[clustering] extends Saveable { * (term indices, term weights in topic). * Each topic's terms are sorted in order of decreasing weight. 
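For the clustering entry points tagged in this patch, a minimal KMeans sketch (toy points; assumes an active SparkContext `sc`):

    import org.apache.spark.mllib.clustering.KMeans
    import org.apache.spark.mllib.linalg.Vectors

    val points = sc.parallelize(Seq(
      Vectors.dense(0.0, 0.0), Vectors.dense(0.1, 0.1),
      Vectors.dense(9.0, 9.0), Vectors.dense(9.1, 9.1)))
    val model = KMeans.train(points, k = 2, maxIterations = 10)
    model.clusterCenters.foreach(println)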
*/ + @Since("1.3.0") def describeTopics(maxTermsPerTopic: Int): Array[(Array[Int], Array[Double])] /** @@ -102,6 +109,7 @@ abstract class LDAModel private[clustering] extends Saveable { * (term indices, term weights in topic). * Each topic's terms are sorted in order of decreasing weight. */ + @Since("1.3.0") def describeTopics(): Array[(Array[Int], Array[Double])] = describeTopics(vocabSize) /* TODO (once LDA can be trained with Strings or given a dictionary) @@ -185,10 +193,11 @@ abstract class LDAModel private[clustering] extends Saveable { * @param topics Inferred topics (vocabSize x k matrix). */ @Experimental +@Since("1.3.0") class LocalLDAModel private[clustering] ( - val topics: Matrix, - override val docConcentration: Vector, - override val topicConcentration: Double, + @Since("1.3.0") val topics: Matrix, + @Since("1.5.0") override val docConcentration: Vector, + @Since("1.5.0") override val topicConcentration: Double, override protected[clustering] val gammaShape: Double = 100) extends LDAModel with Serializable { @@ -376,6 +385,7 @@ class LocalLDAModel private[clustering] ( } @Experimental +@Since("1.5.0") object LocalLDAModel extends Loader[LocalLDAModel] { private object SaveLoadV1_0 { @@ -479,13 +489,14 @@ object LocalLDAModel extends Loader[LocalLDAModel] { * than the [[LocalLDAModel]]. */ @Experimental +@Since("1.3.0") class DistributedLDAModel private[clustering] ( private[clustering] val graph: Graph[LDA.TopicCounts, LDA.TokenCount], private[clustering] val globalTopicTotals: LDA.TopicCounts, - val k: Int, - val vocabSize: Int, - override val docConcentration: Vector, - override val topicConcentration: Double, + @Since("1.3.0") val k: Int, + @Since("1.3.0") val vocabSize: Int, + @Since("1.5.0") override val docConcentration: Vector, + @Since("1.5.0") override val topicConcentration: Double, private[spark] val iterationTimes: Array[Double], override protected[clustering] val gammaShape: Double = 100) extends LDAModel { @@ -603,6 +614,7 @@ class DistributedLDAModel private[clustering] ( * (term indices, topic indices). Note that terms will be omitted if not present in * the document. */ + @Since("1.5.0") lazy val topicAssignments: RDD[(Long, Array[Int], Array[Int])] = { // For reference, compare the below code with the core part of EMLDAOptimizer.next(). 
val eta = topicConcentration @@ -634,6 +646,7 @@ class DistributedLDAModel private[clustering] ( } /** Java-friendly version of [[topicAssignments]] */ + @Since("1.5.0") lazy val javaTopicAssignments: JavaRDD[(java.lang.Long, Array[Int], Array[Int])] = { topicAssignments.asInstanceOf[RDD[(java.lang.Long, Array[Int], Array[Int])]].toJavaRDD() } @@ -770,6 +783,7 @@ class DistributedLDAModel private[clustering] ( @Experimental +@Since("1.5.0") object DistributedLDAModel extends Loader[DistributedLDAModel] { private object SaveLoadV1_0 { diff --git a/mllib/src/main/scala/org/apache/spark/mllib/clustering/PowerIterationClustering.scala b/mllib/src/main/scala/org/apache/spark/mllib/clustering/PowerIterationClustering.scala index 396b36f2f6454..da234bdbb29e6 100644 --- a/mllib/src/main/scala/org/apache/spark/mllib/clustering/PowerIterationClustering.scala +++ b/mllib/src/main/scala/org/apache/spark/mllib/clustering/PowerIterationClustering.scala @@ -42,9 +42,10 @@ import org.apache.spark.{Logging, SparkContext, SparkException} */ @Since("1.3.0") @Experimental -class PowerIterationClusteringModel( - val k: Int, - val assignments: RDD[PowerIterationClustering.Assignment]) extends Saveable with Serializable { +class PowerIterationClusteringModel @Since("1.3.0") ( + @Since("1.3.0") val k: Int, + @Since("1.3.0") val assignments: RDD[PowerIterationClustering.Assignment]) + extends Saveable with Serializable { @Since("1.4.0") override def save(sc: SparkContext, path: String): Unit = { @@ -56,6 +57,8 @@ class PowerIterationClusteringModel( @Since("1.4.0") object PowerIterationClusteringModel extends Loader[PowerIterationClusteringModel] { + + @Since("1.4.0") override def load(sc: SparkContext, path: String): PowerIterationClusteringModel = { PowerIterationClusteringModel.SaveLoadV1_0.load(sc, path) } @@ -120,6 +123,7 @@ object PowerIterationClusteringModel extends Loader[PowerIterationClusteringMode * @see [[http://en.wikipedia.org/wiki/Spectral_clustering Spectral clustering (Wikipedia)]] */ @Experimental +@Since("1.3.0") class PowerIterationClustering private[clustering] ( private var k: Int, private var maxIterations: Int, diff --git a/mllib/src/main/scala/org/apache/spark/mllib/clustering/StreamingKMeans.scala b/mllib/src/main/scala/org/apache/spark/mllib/clustering/StreamingKMeans.scala index 41f2668ec6a7d..1d50ffec96faf 100644 --- a/mllib/src/main/scala/org/apache/spark/mllib/clustering/StreamingKMeans.scala +++ b/mllib/src/main/scala/org/apache/spark/mllib/clustering/StreamingKMeans.scala @@ -66,9 +66,10 @@ import org.apache.spark.util.random.XORShiftRandom */ @Since("1.2.0") @Experimental -class StreamingKMeansModel( - override val clusterCenters: Array[Vector], - val clusterWeights: Array[Double]) extends KMeansModel(clusterCenters) with Logging { +class StreamingKMeansModel @Since("1.2.0") ( + @Since("1.2.0") override val clusterCenters: Array[Vector], + @Since("1.2.0") val clusterWeights: Array[Double]) + extends KMeansModel(clusterCenters) with Logging { /** * Perform a k-means update on a batch of data. 
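Similarly, a sketch of the PowerIterationClustering API annotated above; the (srcId, dstId, similarity) triples are illustrative:

    import org.apache.spark.mllib.clustering.PowerIterationClustering

    val similarities = sc.parallelize(Seq(
      (0L, 1L, 0.9), (1L, 2L, 0.9), (2L, 3L, 0.1), (3L, 4L, 0.9)))
    val model = new PowerIterationClustering()
      .setK(2)
      .setMaxIterations(10)
      .run(similarities)
    model.assignments.collect().foreach(a => println(s"${a.id} -> ${a.cluster}"))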
@@ -168,10 +169,10 @@ class StreamingKMeansModel( */ @Since("1.2.0") @Experimental -class StreamingKMeans( - var k: Int, - var decayFactor: Double, - var timeUnit: String) extends Logging with Serializable { +class StreamingKMeans @Since("1.2.0") ( + @Since("1.2.0") var k: Int, + @Since("1.2.0") var decayFactor: Double, + @Since("1.2.0") var timeUnit: String) extends Logging with Serializable { @Since("1.2.0") def this() = this(2, 1.0, StreamingKMeans.BATCHES) From fb7e12fe2e14af8de4c206ca8096b2e8113bfddc Mon Sep 17 00:00:00 2001 From: Xiangrui Meng Date: Tue, 25 Aug 2015 22:35:49 -0700 Subject: [PATCH 088/802] [SPARK-10243] [MLLIB] update since versions in mllib.tree Same as #8421 but for `mllib.tree`. cc jkbradley Author: Xiangrui Meng Closes #8442 from mengxr/SPARK-10236. --- .../spark/mllib/tree/DecisionTree.scala | 3 +- .../mllib/tree/GradientBoostedTrees.scala | 2 +- .../spark/mllib/tree/configuration/Algo.scala | 2 ++ .../tree/configuration/BoostingStrategy.scala | 12 ++++---- .../tree/configuration/FeatureType.scala | 2 ++ .../tree/configuration/QuantileStrategy.scala | 2 ++ .../mllib/tree/configuration/Strategy.scala | 29 ++++++++++--------- .../mllib/tree/model/DecisionTreeModel.scala | 5 +++- .../apache/spark/mllib/tree/model/Node.scala | 18 ++++++------ .../spark/mllib/tree/model/Predict.scala | 6 ++-- .../apache/spark/mllib/tree/model/Split.scala | 8 ++--- .../mllib/tree/model/treeEnsembleModels.scala | 12 ++++---- 12 files changed, 57 insertions(+), 44 deletions(-) diff --git a/mllib/src/main/scala/org/apache/spark/mllib/tree/DecisionTree.scala b/mllib/src/main/scala/org/apache/spark/mllib/tree/DecisionTree.scala index 972841015d4f0..4a77d4adcd865 100644 --- a/mllib/src/main/scala/org/apache/spark/mllib/tree/DecisionTree.scala +++ b/mllib/src/main/scala/org/apache/spark/mllib/tree/DecisionTree.scala @@ -46,7 +46,8 @@ import org.apache.spark.util.random.XORShiftRandom */ @Since("1.0.0") @Experimental -class DecisionTree (private val strategy: Strategy) extends Serializable with Logging { +class DecisionTree @Since("1.0.0") (private val strategy: Strategy) + extends Serializable with Logging { strategy.assertValid() diff --git a/mllib/src/main/scala/org/apache/spark/mllib/tree/GradientBoostedTrees.scala b/mllib/src/main/scala/org/apache/spark/mllib/tree/GradientBoostedTrees.scala index e750408600c33..95ed48cea6716 100644 --- a/mllib/src/main/scala/org/apache/spark/mllib/tree/GradientBoostedTrees.scala +++ b/mllib/src/main/scala/org/apache/spark/mllib/tree/GradientBoostedTrees.scala @@ -51,7 +51,7 @@ import org.apache.spark.storage.StorageLevel */ @Since("1.2.0") @Experimental -class GradientBoostedTrees(private val boostingStrategy: BoostingStrategy) +class GradientBoostedTrees @Since("1.2.0") (private val boostingStrategy: BoostingStrategy) extends Serializable with Logging { /** diff --git a/mllib/src/main/scala/org/apache/spark/mllib/tree/configuration/Algo.scala b/mllib/src/main/scala/org/apache/spark/mllib/tree/configuration/Algo.scala index 8301ad160836b..853c7319ec44d 100644 --- a/mllib/src/main/scala/org/apache/spark/mllib/tree/configuration/Algo.scala +++ b/mllib/src/main/scala/org/apache/spark/mllib/tree/configuration/Algo.scala @@ -26,7 +26,9 @@ import org.apache.spark.annotation.{Experimental, Since} @Since("1.0.0") @Experimental object Algo extends Enumeration { + @Since("1.0.0") type Algo = Value + @Since("1.0.0") val Classification, Regression = Value private[mllib] def fromString(name: String): Algo = name match { diff --git 
a/mllib/src/main/scala/org/apache/spark/mllib/tree/configuration/BoostingStrategy.scala b/mllib/src/main/scala/org/apache/spark/mllib/tree/configuration/BoostingStrategy.scala index 7c569981977b4..b5c72fba3ede1 100644 --- a/mllib/src/main/scala/org/apache/spark/mllib/tree/configuration/BoostingStrategy.scala +++ b/mllib/src/main/scala/org/apache/spark/mllib/tree/configuration/BoostingStrategy.scala @@ -41,14 +41,14 @@ import org.apache.spark.mllib.tree.loss.{LogLoss, SquaredError, Loss} */ @Since("1.2.0") @Experimental -case class BoostingStrategy( +case class BoostingStrategy @Since("1.4.0") ( // Required boosting parameters - @BeanProperty var treeStrategy: Strategy, - @BeanProperty var loss: Loss, + @Since("1.2.0") @BeanProperty var treeStrategy: Strategy, + @Since("1.2.0") @BeanProperty var loss: Loss, // Optional boosting parameters - @BeanProperty var numIterations: Int = 100, - @BeanProperty var learningRate: Double = 0.1, - @BeanProperty var validationTol: Double = 1e-5) extends Serializable { + @Since("1.2.0") @BeanProperty var numIterations: Int = 100, + @Since("1.2.0") @BeanProperty var learningRate: Double = 0.1, + @Since("1.4.0") @BeanProperty var validationTol: Double = 1e-5) extends Serializable { /** * Check validity of parameters. diff --git a/mllib/src/main/scala/org/apache/spark/mllib/tree/configuration/FeatureType.scala b/mllib/src/main/scala/org/apache/spark/mllib/tree/configuration/FeatureType.scala index bb7c7ee4f964f..4e0cd473def06 100644 --- a/mllib/src/main/scala/org/apache/spark/mllib/tree/configuration/FeatureType.scala +++ b/mllib/src/main/scala/org/apache/spark/mllib/tree/configuration/FeatureType.scala @@ -26,6 +26,8 @@ import org.apache.spark.annotation.{Experimental, Since} @Since("1.0.0") @Experimental object FeatureType extends Enumeration { + @Since("1.0.0") type FeatureType = Value + @Since("1.0.0") val Continuous, Categorical = Value } diff --git a/mllib/src/main/scala/org/apache/spark/mllib/tree/configuration/QuantileStrategy.scala b/mllib/src/main/scala/org/apache/spark/mllib/tree/configuration/QuantileStrategy.scala index 904e42deebb5f..8262db8a4f111 100644 --- a/mllib/src/main/scala/org/apache/spark/mllib/tree/configuration/QuantileStrategy.scala +++ b/mllib/src/main/scala/org/apache/spark/mllib/tree/configuration/QuantileStrategy.scala @@ -26,6 +26,8 @@ import org.apache.spark.annotation.{Experimental, Since} @Since("1.0.0") @Experimental object QuantileStrategy extends Enumeration { + @Since("1.0.0") type QuantileStrategy = Value + @Since("1.0.0") val Sort, MinMax, ApproxHist = Value } diff --git a/mllib/src/main/scala/org/apache/spark/mllib/tree/configuration/Strategy.scala b/mllib/src/main/scala/org/apache/spark/mllib/tree/configuration/Strategy.scala index b74e3f1f46523..89cc13b7c06cf 100644 --- a/mllib/src/main/scala/org/apache/spark/mllib/tree/configuration/Strategy.scala +++ b/mllib/src/main/scala/org/apache/spark/mllib/tree/configuration/Strategy.scala @@ -69,20 +69,20 @@ import org.apache.spark.mllib.tree.configuration.QuantileStrategy._ */ @Since("1.0.0") @Experimental -class Strategy ( - @BeanProperty var algo: Algo, - @BeanProperty var impurity: Impurity, - @BeanProperty var maxDepth: Int, - @BeanProperty var numClasses: Int = 2, - @BeanProperty var maxBins: Int = 32, - @BeanProperty var quantileCalculationStrategy: QuantileStrategy = Sort, - @BeanProperty var categoricalFeaturesInfo: Map[Int, Int] = Map[Int, Int](), - @BeanProperty var minInstancesPerNode: Int = 1, - @BeanProperty var minInfoGain: Double = 0.0, - @BeanProperty var 
maxMemoryInMB: Int = 256, - @BeanProperty var subsamplingRate: Double = 1, - @BeanProperty var useNodeIdCache: Boolean = false, - @BeanProperty var checkpointInterval: Int = 10) extends Serializable { +class Strategy @Since("1.3.0") ( + @Since("1.0.0") @BeanProperty var algo: Algo, + @Since("1.0.0") @BeanProperty var impurity: Impurity, + @Since("1.0.0") @BeanProperty var maxDepth: Int, + @Since("1.2.0") @BeanProperty var numClasses: Int = 2, + @Since("1.0.0") @BeanProperty var maxBins: Int = 32, + @Since("1.0.0") @BeanProperty var quantileCalculationStrategy: QuantileStrategy = Sort, + @Since("1.0.0") @BeanProperty var categoricalFeaturesInfo: Map[Int, Int] = Map[Int, Int](), + @Since("1.2.0") @BeanProperty var minInstancesPerNode: Int = 1, + @Since("1.2.0") @BeanProperty var minInfoGain: Double = 0.0, + @Since("1.0.0") @BeanProperty var maxMemoryInMB: Int = 256, + @Since("1.2.0") @BeanProperty var subsamplingRate: Double = 1, + @Since("1.2.0") @BeanProperty var useNodeIdCache: Boolean = false, + @Since("1.2.0") @BeanProperty var checkpointInterval: Int = 10) extends Serializable { /** */ @@ -206,6 +206,7 @@ object Strategy { } @deprecated("Use Strategy.defaultStrategy instead.", "1.5.0") + @Since("1.2.0") def defaultStategy(algo: Algo): Strategy = defaultStrategy(algo) } diff --git a/mllib/src/main/scala/org/apache/spark/mllib/tree/model/DecisionTreeModel.scala b/mllib/src/main/scala/org/apache/spark/mllib/tree/model/DecisionTreeModel.scala index 3eefd135f7836..e1bf23f4c34bb 100644 --- a/mllib/src/main/scala/org/apache/spark/mllib/tree/model/DecisionTreeModel.scala +++ b/mllib/src/main/scala/org/apache/spark/mllib/tree/model/DecisionTreeModel.scala @@ -43,7 +43,9 @@ import org.apache.spark.util.Utils */ @Since("1.0.0") @Experimental -class DecisionTreeModel(val topNode: Node, val algo: Algo) extends Serializable with Saveable { +class DecisionTreeModel @Since("1.0.0") ( + @Since("1.0.0") val topNode: Node, + @Since("1.0.0") val algo: Algo) extends Serializable with Saveable { /** * Predict values for a single data point using the model trained. @@ -110,6 +112,7 @@ class DecisionTreeModel(val topNode: Node, val algo: Algo) extends Serializable /** * Print the full model to a string. 
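For the tree APIs annotated in this patch, a minimal classifier sketch (two toy points; assumes an active SparkContext `sc`):

    import org.apache.spark.mllib.linalg.Vectors
    import org.apache.spark.mllib.regression.LabeledPoint
    import org.apache.spark.mllib.tree.DecisionTree

    val labeledData = sc.parallelize(Seq(
      LabeledPoint(0.0, Vectors.dense(0.0, 1.0)),
      LabeledPoint(1.0, Vectors.dense(1.0, 0.0))))
    val model = DecisionTree.trainClassifier(labeledData, numClasses = 2,
      categoricalFeaturesInfo = Map[Int, Int](), impurity = "gini", maxDepth = 5, maxBins = 32)
    println(model.toDebugString)  // prints the full tree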
*/ + @Since("1.2.0") def toDebugString: String = { val header = toString + "\n" header + topNode.subtreeToString(2) diff --git a/mllib/src/main/scala/org/apache/spark/mllib/tree/model/Node.scala b/mllib/src/main/scala/org/apache/spark/mllib/tree/model/Node.scala index 8c54c55107233..ea6e5aa5d94e7 100644 --- a/mllib/src/main/scala/org/apache/spark/mllib/tree/model/Node.scala +++ b/mllib/src/main/scala/org/apache/spark/mllib/tree/model/Node.scala @@ -41,15 +41,15 @@ import org.apache.spark.mllib.linalg.Vector */ @Since("1.0.0") @DeveloperApi -class Node ( - val id: Int, - var predict: Predict, - var impurity: Double, - var isLeaf: Boolean, - var split: Option[Split], - var leftNode: Option[Node], - var rightNode: Option[Node], - var stats: Option[InformationGainStats]) extends Serializable with Logging { +class Node @Since("1.2.0") ( + @Since("1.0.0") val id: Int, + @Since("1.0.0") var predict: Predict, + @Since("1.2.0") var impurity: Double, + @Since("1.0.0") var isLeaf: Boolean, + @Since("1.0.0") var split: Option[Split], + @Since("1.0.0") var leftNode: Option[Node], + @Since("1.0.0") var rightNode: Option[Node], + @Since("1.0.0") var stats: Option[InformationGainStats]) extends Serializable with Logging { override def toString: String = { s"id = $id, isLeaf = $isLeaf, predict = $predict, impurity = $impurity, " + diff --git a/mllib/src/main/scala/org/apache/spark/mllib/tree/model/Predict.scala b/mllib/src/main/scala/org/apache/spark/mllib/tree/model/Predict.scala index 965784051ede5..06ceff19d8633 100644 --- a/mllib/src/main/scala/org/apache/spark/mllib/tree/model/Predict.scala +++ b/mllib/src/main/scala/org/apache/spark/mllib/tree/model/Predict.scala @@ -26,9 +26,9 @@ import org.apache.spark.annotation.{DeveloperApi, Since} */ @Since("1.2.0") @DeveloperApi -class Predict( - val predict: Double, - val prob: Double = 0.0) extends Serializable { +class Predict @Since("1.2.0") ( + @Since("1.2.0") val predict: Double, + @Since("1.2.0") val prob: Double = 0.0) extends Serializable { override def toString: String = s"$predict (prob = $prob)" diff --git a/mllib/src/main/scala/org/apache/spark/mllib/tree/model/Split.scala b/mllib/src/main/scala/org/apache/spark/mllib/tree/model/Split.scala index 45db83ae3a1f3..b85a66c05a81d 100644 --- a/mllib/src/main/scala/org/apache/spark/mllib/tree/model/Split.scala +++ b/mllib/src/main/scala/org/apache/spark/mllib/tree/model/Split.scala @@ -34,10 +34,10 @@ import org.apache.spark.mllib.tree.configuration.FeatureType.FeatureType @Since("1.0.0") @DeveloperApi case class Split( - feature: Int, - threshold: Double, - featureType: FeatureType, - categories: List[Double]) { + @Since("1.0.0") feature: Int, + @Since("1.0.0") threshold: Double, + @Since("1.0.0") featureType: FeatureType, + @Since("1.0.0") categories: List[Double]) { override def toString: String = { s"Feature = $feature, threshold = $threshold, featureType = $featureType, " + diff --git a/mllib/src/main/scala/org/apache/spark/mllib/tree/model/treeEnsembleModels.scala b/mllib/src/main/scala/org/apache/spark/mllib/tree/model/treeEnsembleModels.scala index 19571447a2c56..df5b8feab5d5d 100644 --- a/mllib/src/main/scala/org/apache/spark/mllib/tree/model/treeEnsembleModels.scala +++ b/mllib/src/main/scala/org/apache/spark/mllib/tree/model/treeEnsembleModels.scala @@ -48,7 +48,9 @@ import org.apache.spark.util.Utils */ @Since("1.2.0") @Experimental -class RandomForestModel(override val algo: Algo, override val trees: Array[DecisionTreeModel]) +class RandomForestModel @Since("1.2.0") ( + @Since("1.2.0") 
override val algo: Algo, + @Since("1.2.0") override val trees: Array[DecisionTreeModel]) extends TreeEnsembleModel(algo, trees, Array.fill(trees.length)(1.0), combiningStrategy = if (algo == Classification) Vote else Average) with Saveable { @@ -115,10 +117,10 @@ object RandomForestModel extends Loader[RandomForestModel] { */ @Since("1.2.0") @Experimental -class GradientBoostedTreesModel( - override val algo: Algo, - override val trees: Array[DecisionTreeModel], - override val treeWeights: Array[Double]) +class GradientBoostedTreesModel @Since("1.2.0") ( + @Since("1.2.0") override val algo: Algo, + @Since("1.2.0") override val trees: Array[DecisionTreeModel], + @Since("1.2.0") override val treeWeights: Array[Double]) extends TreeEnsembleModel(algo, trees, treeWeights, combiningStrategy = Sum) with Saveable { From 4657fa1f37d41dd4c7240a960342b68c7c591f48 Mon Sep 17 00:00:00 2001 From: Xiangrui Meng Date: Tue, 25 Aug 2015 22:49:33 -0700 Subject: [PATCH 089/802] [SPARK-10235] [MLLIB] update since versions in mllib.regression Same as #8421 but for `mllib.regression`. cc freeman-lab dbtsai Author: Xiangrui Meng Closes #8426 from mengxr/SPARK-10235 and squashes the following commits: 6cd28e4 [Xiangrui Meng] update since versions in mllib.regression --- .../regression/GeneralizedLinearAlgorithm.scala | 6 ++++-- .../mllib/regression/IsotonicRegression.scala | 16 +++++++++------- .../spark/mllib/regression/LabeledPoint.scala | 5 +++-- .../apache/spark/mllib/regression/Lasso.scala | 9 ++++++--- .../mllib/regression/LinearRegression.scala | 9 ++++++--- .../spark/mllib/regression/RidgeRegression.scala | 12 +++++++----- .../regression/StreamingLinearAlgorithm.scala | 8 +++----- .../StreamingLinearRegressionWithSGD.scala | 11 +++++++++-- 8 files changed, 47 insertions(+), 29 deletions(-) diff --git a/mllib/src/main/scala/org/apache/spark/mllib/regression/GeneralizedLinearAlgorithm.scala b/mllib/src/main/scala/org/apache/spark/mllib/regression/GeneralizedLinearAlgorithm.scala index 509f6a2d169c4..7e3b4d5648fe3 100644 --- a/mllib/src/main/scala/org/apache/spark/mllib/regression/GeneralizedLinearAlgorithm.scala +++ b/mllib/src/main/scala/org/apache/spark/mllib/regression/GeneralizedLinearAlgorithm.scala @@ -38,7 +38,9 @@ import org.apache.spark.storage.StorageLevel */ @Since("0.8.0") @DeveloperApi -abstract class GeneralizedLinearModel(val weights: Vector, val intercept: Double) +abstract class GeneralizedLinearModel @Since("1.0.0") ( + @Since("1.0.0") val weights: Vector, + @Since("0.8.0") val intercept: Double) extends Serializable { /** @@ -107,7 +109,7 @@ abstract class GeneralizedLinearAlgorithm[M <: GeneralizedLinearModel] * The optimizer to solve the problem. * */ - @Since("1.0.0") + @Since("0.8.0") def optimizer: Optimizer /** Whether to add intercept (default: false). 
*/ diff --git a/mllib/src/main/scala/org/apache/spark/mllib/regression/IsotonicRegression.scala b/mllib/src/main/scala/org/apache/spark/mllib/regression/IsotonicRegression.scala index 31ca7c2f207d9..877d31ba41303 100644 --- a/mllib/src/main/scala/org/apache/spark/mllib/regression/IsotonicRegression.scala +++ b/mllib/src/main/scala/org/apache/spark/mllib/regression/IsotonicRegression.scala @@ -50,10 +50,10 @@ import org.apache.spark.sql.SQLContext */ @Since("1.3.0") @Experimental -class IsotonicRegressionModel ( - val boundaries: Array[Double], - val predictions: Array[Double], - val isotonic: Boolean) extends Serializable with Saveable { +class IsotonicRegressionModel @Since("1.3.0") ( + @Since("1.3.0") val boundaries: Array[Double], + @Since("1.3.0") val predictions: Array[Double], + @Since("1.3.0") val isotonic: Boolean) extends Serializable with Saveable { private val predictionOrd = if (isotonic) Ordering[Double] else Ordering[Double].reverse @@ -63,7 +63,6 @@ class IsotonicRegressionModel ( /** * A Java-friendly constructor that takes two Iterable parameters and one Boolean parameter. - * */ @Since("1.4.0") def this(boundaries: java.lang.Iterable[Double], @@ -214,8 +213,6 @@ object IsotonicRegressionModel extends Loader[IsotonicRegressionModel] { } } - /** - */ @Since("1.4.0") override def load(sc: SparkContext, path: String): IsotonicRegressionModel = { implicit val formats = DefaultFormats @@ -256,6 +253,7 @@ object IsotonicRegressionModel extends Loader[IsotonicRegressionModel] { * @see [[http://en.wikipedia.org/wiki/Isotonic_regression Isotonic regression (Wikipedia)]] */ @Experimental +@Since("1.3.0") class IsotonicRegression private (private var isotonic: Boolean) extends Serializable { /** @@ -263,6 +261,7 @@ class IsotonicRegression private (private var isotonic: Boolean) extends Seriali * * @return New instance of IsotonicRegression. */ + @Since("1.3.0") def this() = this(true) /** @@ -271,6 +270,7 @@ class IsotonicRegression private (private var isotonic: Boolean) extends Seriali * @param isotonic Isotonic (increasing) or antitonic (decreasing) sequence. * @return This instance of IsotonicRegression. */ + @Since("1.3.0") def setIsotonic(isotonic: Boolean): this.type = { this.isotonic = isotonic this @@ -286,6 +286,7 @@ class IsotonicRegression private (private var isotonic: Boolean) extends Seriali * the algorithm is executed. * @return Isotonic regression model. */ + @Since("1.3.0") def run(input: RDD[(Double, Double, Double)]): IsotonicRegressionModel = { val preprocessedInput = if (isotonic) { input @@ -311,6 +312,7 @@ class IsotonicRegression private (private var isotonic: Boolean) extends Seriali * the algorithm is executed. * @return Isotonic regression model. */ + @Since("1.3.0") def run(input: JavaRDD[(JDouble, JDouble, JDouble)]): IsotonicRegressionModel = { run(input.rdd.retag.asInstanceOf[RDD[(Double, Double, Double)]]) } diff --git a/mllib/src/main/scala/org/apache/spark/mllib/regression/LabeledPoint.scala b/mllib/src/main/scala/org/apache/spark/mllib/regression/LabeledPoint.scala index f7fe1b7b21fca..c284ad2325374 100644 --- a/mllib/src/main/scala/org/apache/spark/mllib/regression/LabeledPoint.scala +++ b/mllib/src/main/scala/org/apache/spark/mllib/regression/LabeledPoint.scala @@ -29,11 +29,12 @@ import org.apache.spark.SparkException * * @param label Label for this data point. * @param features List of features for this data point. 
- * */ @Since("0.8.0") @BeanInfo -case class LabeledPoint(label: Double, features: Vector) { +case class LabeledPoint @Since("1.0.0") ( + @Since("0.8.0") label: Double, + @Since("1.0.0") features: Vector) { override def toString: String = { s"($label,$features)" } diff --git a/mllib/src/main/scala/org/apache/spark/mllib/regression/Lasso.scala b/mllib/src/main/scala/org/apache/spark/mllib/regression/Lasso.scala index 556411a366bd2..a9aba173fa0e3 100644 --- a/mllib/src/main/scala/org/apache/spark/mllib/regression/Lasso.scala +++ b/mllib/src/main/scala/org/apache/spark/mllib/regression/Lasso.scala @@ -34,9 +34,9 @@ import org.apache.spark.rdd.RDD * */ @Since("0.8.0") -class LassoModel ( - override val weights: Vector, - override val intercept: Double) +class LassoModel @Since("1.1.0") ( + @Since("1.0.0") override val weights: Vector, + @Since("0.8.0") override val intercept: Double) extends GeneralizedLinearModel(weights, intercept) with RegressionModel with Serializable with Saveable with PMMLExportable { @@ -84,6 +84,7 @@ object LassoModel extends Loader[LassoModel] { * its corresponding right hand side label y. * See also the documentation for the precise formulation. */ +@Since("0.8.0") class LassoWithSGD private ( private var stepSize: Double, private var numIterations: Int, @@ -93,6 +94,7 @@ class LassoWithSGD private ( private val gradient = new LeastSquaresGradient() private val updater = new L1Updater() + @Since("0.8.0") override val optimizer = new GradientDescent(gradient, updater) .setStepSize(stepSize) .setNumIterations(numIterations) @@ -103,6 +105,7 @@ class LassoWithSGD private ( * Construct a Lasso object with default parameters: {stepSize: 1.0, numIterations: 100, * regParam: 0.01, miniBatchFraction: 1.0}. */ + @Since("0.8.0") def this() = this(1.0, 100, 0.01, 1.0) override protected def createModel(weights: Vector, intercept: Double) = { diff --git a/mllib/src/main/scala/org/apache/spark/mllib/regression/LinearRegression.scala b/mllib/src/main/scala/org/apache/spark/mllib/regression/LinearRegression.scala index 00ab06e3ba738..4996ace5df85d 100644 --- a/mllib/src/main/scala/org/apache/spark/mllib/regression/LinearRegression.scala +++ b/mllib/src/main/scala/org/apache/spark/mllib/regression/LinearRegression.scala @@ -34,9 +34,9 @@ import org.apache.spark.rdd.RDD * */ @Since("0.8.0") -class LinearRegressionModel ( - override val weights: Vector, - override val intercept: Double) +class LinearRegressionModel @Since("1.1.0") ( + @Since("1.0.0") override val weights: Vector, + @Since("0.8.0") override val intercept: Double) extends GeneralizedLinearModel(weights, intercept) with RegressionModel with Serializable with Saveable with PMMLExportable { @@ -85,6 +85,7 @@ object LinearRegressionModel extends Loader[LinearRegressionModel] { * its corresponding right hand side label y. * See also the documentation for the precise formulation. */ +@Since("0.8.0") class LinearRegressionWithSGD private[mllib] ( private var stepSize: Double, private var numIterations: Int, @@ -93,6 +94,7 @@ class LinearRegressionWithSGD private[mllib] ( private val gradient = new LeastSquaresGradient() private val updater = new SimpleUpdater() + @Since("0.8.0") override val optimizer = new GradientDescent(gradient, updater) .setStepSize(stepSize) .setNumIterations(numIterations) @@ -102,6 +104,7 @@ class LinearRegressionWithSGD private[mllib] ( * Construct a LinearRegression object with default parameters: {stepSize: 1.0, * numIterations: 100, miniBatchFraction: 1.0}. 
*/ + @Since("0.8.0") def this() = this(1.0, 100, 1.0) override protected[mllib] def createModel(weights: Vector, intercept: Double) = { diff --git a/mllib/src/main/scala/org/apache/spark/mllib/regression/RidgeRegression.scala b/mllib/src/main/scala/org/apache/spark/mllib/regression/RidgeRegression.scala index 21a791d98b2cb..0a44ff559d55b 100644 --- a/mllib/src/main/scala/org/apache/spark/mllib/regression/RidgeRegression.scala +++ b/mllib/src/main/scala/org/apache/spark/mllib/regression/RidgeRegression.scala @@ -35,9 +35,9 @@ import org.apache.spark.rdd.RDD * */ @Since("0.8.0") -class RidgeRegressionModel ( - override val weights: Vector, - override val intercept: Double) +class RidgeRegressionModel @Since("1.1.0") ( + @Since("1.0.0") override val weights: Vector, + @Since("0.8.0") override val intercept: Double) extends GeneralizedLinearModel(weights, intercept) with RegressionModel with Serializable with Saveable with PMMLExportable { @@ -85,6 +85,7 @@ object RidgeRegressionModel extends Loader[RidgeRegressionModel] { * its corresponding right hand side label y. * See also the documentation for the precise formulation. */ +@Since("0.8.0") class RidgeRegressionWithSGD private ( private var stepSize: Double, private var numIterations: Int, @@ -94,7 +95,7 @@ class RidgeRegressionWithSGD private ( private val gradient = new LeastSquaresGradient() private val updater = new SquaredL2Updater() - + @Since("0.8.0") override val optimizer = new GradientDescent(gradient, updater) .setStepSize(stepSize) .setNumIterations(numIterations) @@ -105,6 +106,7 @@ class RidgeRegressionWithSGD private ( * Construct a RidgeRegression object with default parameters: {stepSize: 1.0, numIterations: 100, * regParam: 0.01, miniBatchFraction: 1.0}. */ + @Since("0.8.0") def this() = this(1.0, 100, 0.01, 1.0) override protected def createModel(weights: Vector, intercept: Double) = { @@ -134,7 +136,7 @@ object RidgeRegressionWithSGD { * the number of features in the data. * */ - @Since("0.8.0") + @Since("1.0.0") def train( input: RDD[LabeledPoint], numIterations: Int, diff --git a/mllib/src/main/scala/org/apache/spark/mllib/regression/StreamingLinearAlgorithm.scala b/mllib/src/main/scala/org/apache/spark/mllib/regression/StreamingLinearAlgorithm.scala index cd3ed8a1549db..73948b2d9851a 100644 --- a/mllib/src/main/scala/org/apache/spark/mllib/regression/StreamingLinearAlgorithm.scala +++ b/mllib/src/main/scala/org/apache/spark/mllib/regression/StreamingLinearAlgorithm.scala @@ -22,7 +22,7 @@ import scala.reflect.ClassTag import org.apache.spark.Logging import org.apache.spark.annotation.{DeveloperApi, Since} import org.apache.spark.api.java.JavaSparkContext.fakeClassTag -import org.apache.spark.mllib.linalg.{Vector, Vectors} +import org.apache.spark.mllib.linalg.Vector import org.apache.spark.streaming.api.java.{JavaDStream, JavaPairDStream} import org.apache.spark.streaming.dstream.DStream @@ -83,9 +83,8 @@ abstract class StreamingLinearAlgorithm[ * batch of data from the stream. * * @param data DStream containing labeled data - * */ - @Since("1.3.0") + @Since("1.1.0") def trainOn(data: DStream[LabeledPoint]): Unit = { if (model.isEmpty) { throw new IllegalArgumentException("Model must be initialized before starting training.") @@ -105,7 +104,6 @@ abstract class StreamingLinearAlgorithm[ /** * Java-friendly version of `trainOn`. 
- * */ @Since("1.3.0") def trainOn(data: JavaDStream[LabeledPoint]): Unit = trainOn(data.dstream) @@ -129,7 +127,7 @@ abstract class StreamingLinearAlgorithm[ * Java-friendly version of `predictOn`. * */ - @Since("1.1.0") + @Since("1.3.0") def predictOn(data: JavaDStream[Vector]): JavaDStream[java.lang.Double] = { JavaDStream.fromDStream(predictOn(data.dstream).asInstanceOf[DStream[java.lang.Double]]) } diff --git a/mllib/src/main/scala/org/apache/spark/mllib/regression/StreamingLinearRegressionWithSGD.scala b/mllib/src/main/scala/org/apache/spark/mllib/regression/StreamingLinearRegressionWithSGD.scala index 26654e4a06838..fe1d487cdd078 100644 --- a/mllib/src/main/scala/org/apache/spark/mllib/regression/StreamingLinearRegressionWithSGD.scala +++ b/mllib/src/main/scala/org/apache/spark/mllib/regression/StreamingLinearRegressionWithSGD.scala @@ -17,7 +17,7 @@ package org.apache.spark.mllib.regression -import org.apache.spark.annotation.Experimental +import org.apache.spark.annotation.{Experimental, Since} import org.apache.spark.mllib.linalg.Vector /** @@ -41,6 +41,7 @@ import org.apache.spark.mllib.linalg.Vector * .trainOn(DStream) */ @Experimental +@Since("1.1.0") class StreamingLinearRegressionWithSGD private[mllib] ( private var stepSize: Double, private var numIterations: Int, @@ -54,8 +55,10 @@ class StreamingLinearRegressionWithSGD private[mllib] ( * Initial weights must be set before using trainOn or predictOn * (see `StreamingLinearAlgorithm`) */ + @Since("1.1.0") def this() = this(0.1, 50, 1.0) + @Since("1.1.0") val algorithm = new LinearRegressionWithSGD(stepSize, numIterations, miniBatchFraction) protected var model: Option[LinearRegressionModel] = None @@ -63,6 +66,7 @@ class StreamingLinearRegressionWithSGD private[mllib] ( /** * Set the step size for gradient descent. Default: 0.1. */ + @Since("1.1.0") def setStepSize(stepSize: Double): this.type = { this.algorithm.optimizer.setStepSize(stepSize) this @@ -71,6 +75,7 @@ class StreamingLinearRegressionWithSGD private[mllib] ( /** * Set the number of iterations of gradient descent to run per update. Default: 50. */ + @Since("1.1.0") def setNumIterations(numIterations: Int): this.type = { this.algorithm.optimizer.setNumIterations(numIterations) this @@ -79,6 +84,7 @@ class StreamingLinearRegressionWithSGD private[mllib] ( /** * Set the fraction of each batch to use for updates. Default: 1.0. */ + @Since("1.1.0") def setMiniBatchFraction(miniBatchFraction: Double): this.type = { this.algorithm.optimizer.setMiniBatchFraction(miniBatchFraction) this @@ -87,6 +93,7 @@ class StreamingLinearRegressionWithSGD private[mllib] ( /** * Set the initial weights. */ + @Since("1.1.0") def setInitialWeights(initialWeights: Vector): this.type = { this.model = Some(algorithm.createModel(initialWeights, 0.0)) this @@ -95,9 +102,9 @@ class StreamingLinearRegressionWithSGD private[mllib] ( /** * Set the convergence tolerance. Default: 0.001. */ + @Since("1.5.0") def setConvergenceTol(tolerance: Double): this.type = { this.algorithm.optimizer.setConvergenceTol(tolerance) this } - } From 321d7759691bed9867b1f0470f12eab2faa50aff Mon Sep 17 00:00:00 2001 From: Xiangrui Meng Date: Tue, 25 Aug 2015 23:45:41 -0700 Subject: [PATCH 090/802] [SPARK-10236] [MLLIB] update since versions in mllib.feature Same as #8421 but for `mllib.feature`. 
cc dbtsai Author: Xiangrui Meng Closes #8449 from mengxr/SPARK-10236.feature and squashes the following commits: 0e8d658 [Xiangrui Meng] remove unnecessary comment ad70b03 [Xiangrui Meng] update since versions in mllib.feature --- .../mllib/clustering/PowerIterationClustering.scala | 2 -- .../apache/spark/mllib/feature/ChiSqSelector.scala | 4 ++-- .../spark/mllib/feature/ElementwiseProduct.scala | 3 ++- .../scala/org/apache/spark/mllib/feature/IDF.scala | 6 ++++-- .../org/apache/spark/mllib/feature/Normalizer.scala | 2 +- .../scala/org/apache/spark/mllib/feature/PCA.scala | 7 +++++-- .../apache/spark/mllib/feature/StandardScaler.scala | 12 ++++++------ .../org/apache/spark/mllib/feature/Word2Vec.scala | 1 + 8 files changed, 21 insertions(+), 16 deletions(-) diff --git a/mllib/src/main/scala/org/apache/spark/mllib/clustering/PowerIterationClustering.scala b/mllib/src/main/scala/org/apache/spark/mllib/clustering/PowerIterationClustering.scala index da234bdbb29e6..6c76e26fd1626 100644 --- a/mllib/src/main/scala/org/apache/spark/mllib/clustering/PowerIterationClustering.scala +++ b/mllib/src/main/scala/org/apache/spark/mllib/clustering/PowerIterationClustering.scala @@ -71,8 +71,6 @@ object PowerIterationClusteringModel extends Loader[PowerIterationClusteringMode private[clustering] val thisClassName = "org.apache.spark.mllib.clustering.PowerIterationClusteringModel" - /** - */ @Since("1.4.0") def save(sc: SparkContext, model: PowerIterationClusteringModel, path: String): Unit = { val sqlContext = new SQLContext(sc) diff --git a/mllib/src/main/scala/org/apache/spark/mllib/feature/ChiSqSelector.scala b/mllib/src/main/scala/org/apache/spark/mllib/feature/ChiSqSelector.scala index fdd974d7a391e..4743cfd1a2c3f 100644 --- a/mllib/src/main/scala/org/apache/spark/mllib/feature/ChiSqSelector.scala +++ b/mllib/src/main/scala/org/apache/spark/mllib/feature/ChiSqSelector.scala @@ -33,7 +33,7 @@ import org.apache.spark.rdd.RDD */ @Since("1.3.0") @Experimental -class ChiSqSelectorModel ( +class ChiSqSelectorModel @Since("1.3.0") ( @Since("1.3.0") val selectedFeatures: Array[Int]) extends VectorTransformer { require(isSorted(selectedFeatures), "Array has to be sorted asc") @@ -112,7 +112,7 @@ class ChiSqSelectorModel ( */ @Since("1.3.0") @Experimental -class ChiSqSelector ( +class ChiSqSelector @Since("1.3.0") ( @Since("1.3.0") val numTopFeatures: Int) extends Serializable { /** diff --git a/mllib/src/main/scala/org/apache/spark/mllib/feature/ElementwiseProduct.scala b/mllib/src/main/scala/org/apache/spark/mllib/feature/ElementwiseProduct.scala index 33e2d17bb472e..d0a6cf61687a8 100644 --- a/mllib/src/main/scala/org/apache/spark/mllib/feature/ElementwiseProduct.scala +++ b/mllib/src/main/scala/org/apache/spark/mllib/feature/ElementwiseProduct.scala @@ -29,7 +29,8 @@ import org.apache.spark.mllib.linalg._ */ @Since("1.4.0") @Experimental -class ElementwiseProduct(val scalingVec: Vector) extends VectorTransformer { +class ElementwiseProduct @Since("1.4.0") ( + @Since("1.4.0") val scalingVec: Vector) extends VectorTransformer { /** * Does the hadamard product transformation. 
diff --git a/mllib/src/main/scala/org/apache/spark/mllib/feature/IDF.scala b/mllib/src/main/scala/org/apache/spark/mllib/feature/IDF.scala index d5353ddd972e0..68078ccfa3d60 100644 --- a/mllib/src/main/scala/org/apache/spark/mllib/feature/IDF.scala +++ b/mllib/src/main/scala/org/apache/spark/mllib/feature/IDF.scala @@ -39,8 +39,9 @@ import org.apache.spark.rdd.RDD */ @Since("1.1.0") @Experimental -class IDF(val minDocFreq: Int) { +class IDF @Since("1.2.0") (@Since("1.2.0") val minDocFreq: Int) { + @Since("1.1.0") def this() = this(0) // TODO: Allow different IDF formulations. @@ -162,7 +163,8 @@ private object IDF { * Represents an IDF model that can transform term frequency vectors. */ @Experimental -class IDFModel private[spark] (val idf: Vector) extends Serializable { +@Since("1.1.0") +class IDFModel private[spark] (@Since("1.1.0") val idf: Vector) extends Serializable { /** * Transforms term frequency (TF) vectors to TF-IDF vectors. diff --git a/mllib/src/main/scala/org/apache/spark/mllib/feature/Normalizer.scala b/mllib/src/main/scala/org/apache/spark/mllib/feature/Normalizer.scala index 0e070257d9fb2..8d5a22520d6b8 100644 --- a/mllib/src/main/scala/org/apache/spark/mllib/feature/Normalizer.scala +++ b/mllib/src/main/scala/org/apache/spark/mllib/feature/Normalizer.scala @@ -33,7 +33,7 @@ import org.apache.spark.mllib.linalg.{DenseVector, SparseVector, Vector, Vectors */ @Since("1.1.0") @Experimental -class Normalizer(p: Double) extends VectorTransformer { +class Normalizer @Since("1.1.0") (p: Double) extends VectorTransformer { @Since("1.1.0") def this() = this(2) diff --git a/mllib/src/main/scala/org/apache/spark/mllib/feature/PCA.scala b/mllib/src/main/scala/org/apache/spark/mllib/feature/PCA.scala index a48b7bba665d7..ecb3c1e6c1c83 100644 --- a/mllib/src/main/scala/org/apache/spark/mllib/feature/PCA.scala +++ b/mllib/src/main/scala/org/apache/spark/mllib/feature/PCA.scala @@ -29,7 +29,7 @@ import org.apache.spark.rdd.RDD * @param k number of principal components */ @Since("1.4.0") -class PCA(val k: Int) { +class PCA @Since("1.4.0") (@Since("1.4.0") val k: Int) { require(k >= 1, s"PCA requires a number of principal components k >= 1 but was given $k") /** @@ -74,7 +74,10 @@ class PCA(val k: Int) { * @param k number of principal components. * @param pc a principal components Matrix. Each column is one principal component. */ -class PCAModel private[spark] (val k: Int, val pc: DenseMatrix) extends VectorTransformer { +@Since("1.4.0") +class PCAModel private[spark] ( + @Since("1.4.0") val k: Int, + @Since("1.4.0") val pc: DenseMatrix) extends VectorTransformer { /** * Transform a vector by computed Principal Components. 
* diff --git a/mllib/src/main/scala/org/apache/spark/mllib/feature/StandardScaler.scala b/mllib/src/main/scala/org/apache/spark/mllib/feature/StandardScaler.scala index b95d5a899001e..f018b453bae7e 100644 --- a/mllib/src/main/scala/org/apache/spark/mllib/feature/StandardScaler.scala +++ b/mllib/src/main/scala/org/apache/spark/mllib/feature/StandardScaler.scala @@ -34,7 +34,7 @@ import org.apache.spark.rdd.RDD */ @Since("1.1.0") @Experimental -class StandardScaler(withMean: Boolean, withStd: Boolean) extends Logging { +class StandardScaler @Since("1.1.0") (withMean: Boolean, withStd: Boolean) extends Logging { @Since("1.1.0") def this() = this(false, true) @@ -74,11 +74,11 @@ class StandardScaler(withMean: Boolean, withStd: Boolean) extends Logging { */ @Since("1.1.0") @Experimental -class StandardScalerModel ( - val std: Vector, - val mean: Vector, - var withStd: Boolean, - var withMean: Boolean) extends VectorTransformer { +class StandardScalerModel @Since("1.3.0") ( + @Since("1.3.0") val std: Vector, + @Since("1.1.0") val mean: Vector, + @Since("1.3.0") var withStd: Boolean, + @Since("1.3.0") var withMean: Boolean) extends VectorTransformer { /** */ diff --git a/mllib/src/main/scala/org/apache/spark/mllib/feature/Word2Vec.scala b/mllib/src/main/scala/org/apache/spark/mllib/feature/Word2Vec.scala index e6f45ae4b01d5..36b124c5d2966 100644 --- a/mllib/src/main/scala/org/apache/spark/mllib/feature/Word2Vec.scala +++ b/mllib/src/main/scala/org/apache/spark/mllib/feature/Word2Vec.scala @@ -436,6 +436,7 @@ class Word2Vec extends Serializable with Logging { * (i * vectorSize, i * vectorSize + vectorSize) */ @Experimental +@Since("1.1.0") class Word2VecModel private[mllib] ( private val wordIndex: Map[String, Int], private val wordVectors: Array[Float]) extends Serializable with Saveable { From 75d4773aa50e24972c533e8b48697fde586429eb Mon Sep 17 00:00:00 2001 From: felixcheung Date: Tue, 25 Aug 2015 23:48:16 -0700 Subject: [PATCH 091/802] [SPARK-9316] [SPARKR] Add support for filtering using `[` (synonym for filter / select) Add support for ``` df[df$name == "Smith", c(1,2)] df[df$age %in% c(19, 30), 1:2] ``` shivaram Author: felixcheung Closes #8394 from felixcheung/rsubset. --- R/pkg/R/DataFrame.R | 22 +++++++++++++++++++++- R/pkg/inst/tests/test_sparkSQL.R | 27 +++++++++++++++++++++++++++ 2 files changed, 48 insertions(+), 1 deletion(-) diff --git a/R/pkg/R/DataFrame.R b/R/pkg/R/DataFrame.R index ae1d912cf6da1..a5162de705f8f 100644 --- a/R/pkg/R/DataFrame.R +++ b/R/pkg/R/DataFrame.R @@ -985,9 +985,11 @@ setMethod("$<-", signature(x = "DataFrame"), x }) +setClassUnion("numericOrcharacter", c("numeric", "character")) + #' @rdname select #' @name [[ -setMethod("[[", signature(x = "DataFrame"), +setMethod("[[", signature(x = "DataFrame", i = "numericOrcharacter"), function(x, i) { if (is.numeric(i)) { cols <- columns(x) @@ -1010,6 +1012,20 @@ setMethod("[", signature(x = "DataFrame", i = "missing"), select(x, j) }) +#' @rdname select +#' @name [ +setMethod("[", signature(x = "DataFrame", i = "Column"), + function(x, i, j, ...) { + # It could handle i as "character" but it seems confusing and not required + # https://stat.ethz.ch/R-manual/R-devel/library/base/html/Extract.data.frame.html + filtered <- filter(x, i) + if (!missing(j)) { + filtered[, j] + } else { + filtered + } + }) + #' Select #' #' Selects a set of columns with names or Column expressions. 
@@ -1028,8 +1044,12 @@ setMethod("[", signature(x = "DataFrame", i = "missing"), #' # Columns can also be selected using `[[` and `[` #' df[[2]] == df[["age"]] #' df[,2] == df[,"age"] +#' df[,c("name", "age")] #' # Similar to R data frames columns can also be selected using `$` #' df$age +#' # It can also be subset on rows and Columns +#' df[df$name == "Smith", c(1,2)] +#' df[df$age %in% c(19, 30), 1:2] #' } setMethod("select", signature(x = "DataFrame", col = "character"), function(x, col, ...) { diff --git a/R/pkg/inst/tests/test_sparkSQL.R b/R/pkg/inst/tests/test_sparkSQL.R index 556b8c5447054..ee48a3dc0cc05 100644 --- a/R/pkg/inst/tests/test_sparkSQL.R +++ b/R/pkg/inst/tests/test_sparkSQL.R @@ -587,6 +587,33 @@ test_that("select with column", { expect_equal(collect(select(df3, "x"))[[1, 1]], "x") }) +test_that("subsetting", { + # jsonFile returns columns in random order + df <- select(jsonFile(sqlContext, jsonPath), "name", "age") + filtered <- df[df$age > 20,] + expect_equal(count(filtered), 1) + expect_equal(columns(filtered), c("name", "age")) + expect_equal(collect(filtered)$name, "Andy") + + df2 <- df[df$age == 19, 1] + expect_is(df2, "DataFrame") + expect_equal(count(df2), 1) + expect_equal(columns(df2), c("name")) + expect_equal(collect(df2)$name, "Justin") + + df3 <- df[df$age > 20, 2] + expect_equal(count(df3), 1) + expect_equal(columns(df3), c("age")) + + df4 <- df[df$age %in% c(19, 30), 1:2] + expect_equal(count(df4), 2) + expect_equal(columns(df4), c("name", "age")) + + df5 <- df[df$age %in% c(19), c(1,2)] + expect_equal(count(df5), 1) + expect_equal(columns(df5), c("name", "age")) +}) + test_that("selectExpr() on a DataFrame", { df <- jsonFile(sqlContext, jsonPath) selected <- selectExpr(df, "age * 2") From bb1640529725c6c38103b95af004f8bd90eeee5c Mon Sep 17 00:00:00 2001 From: Reynold Xin Date: Wed, 26 Aug 2015 00:37:04 -0700 Subject: [PATCH 092/802] Closes #8443 From 6519fd06cc8175c9182ef16cf8a37d7f255eb846 Mon Sep 17 00:00:00 2001 From: Xiangrui Meng Date: Wed, 26 Aug 2015 11:47:05 -0700 Subject: [PATCH 093/802] [SPARK-9665] [MLLIB] audit MLlib API annotations I only found `ml.NaiveBayes` missing `Experimental` annotation. This PR doesn't cover Python APIs. cc jkbradley Author: Xiangrui Meng Closes #8452 from mengxr/SPARK-9665. 
--- .../apache/spark/ml/classification/NaiveBayes.scala | 12 ++++++++---- 1 file changed, 8 insertions(+), 4 deletions(-) diff --git a/mllib/src/main/scala/org/apache/spark/ml/classification/NaiveBayes.scala b/mllib/src/main/scala/org/apache/spark/ml/classification/NaiveBayes.scala index 97cbaf1fa8761..69cb88a7e6718 100644 --- a/mllib/src/main/scala/org/apache/spark/ml/classification/NaiveBayes.scala +++ b/mllib/src/main/scala/org/apache/spark/ml/classification/NaiveBayes.scala @@ -18,11 +18,11 @@ package org.apache.spark.ml.classification import org.apache.spark.SparkException -import org.apache.spark.ml.{PredictorParams, PredictionModel, Predictor} -import org.apache.spark.ml.param.{ParamMap, ParamValidators, Param, DoubleParam} +import org.apache.spark.annotation.Experimental +import org.apache.spark.ml.PredictorParams +import org.apache.spark.ml.param.{DoubleParam, Param, ParamMap, ParamValidators} import org.apache.spark.ml.util.Identifiable -import org.apache.spark.mllib.classification.{NaiveBayes => OldNaiveBayes} -import org.apache.spark.mllib.classification.{NaiveBayesModel => OldNaiveBayesModel} +import org.apache.spark.mllib.classification.{NaiveBayes => OldNaiveBayes, NaiveBayesModel => OldNaiveBayesModel} import org.apache.spark.mllib.linalg._ import org.apache.spark.mllib.regression.LabeledPoint import org.apache.spark.rdd.RDD @@ -59,6 +59,7 @@ private[ml] trait NaiveBayesParams extends PredictorParams { } /** + * :: Experimental :: * Naive Bayes Classifiers. * It supports both Multinomial NB * ([[http://nlp.stanford.edu/IR-book/html/htmledition/naive-bayes-text-classification-1.html]]) @@ -68,6 +69,7 @@ private[ml] trait NaiveBayesParams extends PredictorParams { * ([[http://nlp.stanford.edu/IR-book/html/htmledition/the-bernoulli-model-1.html]]). * The input feature values must be nonnegative. */ +@Experimental class NaiveBayes(override val uid: String) extends ProbabilisticClassifier[Vector, NaiveBayes, NaiveBayesModel] with NaiveBayesParams { @@ -101,11 +103,13 @@ class NaiveBayes(override val uid: String) } /** + * :: Experimental :: * Model produced by [[NaiveBayes]] * @param pi log of class priors, whose dimension is C (number of classes) * @param theta log of class conditional probabilities, whose dimension is C (number of classes) * by D (number of features) */ +@Experimental class NaiveBayesModel private[ml] ( override val uid: String, val pi: Vector, From de7209c256aaf79a0978cfcf6e98bb013267b93a Mon Sep 17 00:00:00 2001 From: Patrick Wendell Date: Wed, 26 Aug 2015 12:19:36 -0700 Subject: [PATCH 094/802] HOTFIX: Increase PRB timeout --- dev/run-tests-jenkins | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/dev/run-tests-jenkins b/dev/run-tests-jenkins index c4d39d95d5890..f144c053046c5 100755 --- a/dev/run-tests-jenkins +++ b/dev/run-tests-jenkins @@ -48,8 +48,8 @@ COMMIT_URL="https://github.com/apache/spark/commit/${ghprbActualCommit}" SHORT_COMMIT_HASH="${ghprbActualCommit:0:7}" # format: http://linux.die.net/man/1/timeout -# must be less than the timeout configured on Jenkins (currently 180m) -TESTS_TIMEOUT="175m" +# must be less than the timeout configured on Jenkins (currently 300m) +TESTS_TIMEOUT="250m" # Array to capture all tests to run on the pull request. These tests are held under the #+ dev/tests/ directory. 
From 086d4681df3ebfccfc04188262c10482f44553b0 Mon Sep 17 00:00:00 2001 From: Xiangrui Meng Date: Wed, 26 Aug 2015 14:02:19 -0700 Subject: [PATCH 095/802] [SPARK-10241] [MLLIB] update since versions in mllib.recommendation Same as #8421 but for `mllib.recommendation`. cc srowen coderxiang Author: Xiangrui Meng Closes #8432 from mengxr/SPARK-10241. --- .../spark/mllib/recommendation/ALS.scala | 22 ++++++++++++++++++- .../MatrixFactorizationModel.scala | 8 +++---- 2 files changed, 25 insertions(+), 5 deletions(-) diff --git a/mllib/src/main/scala/org/apache/spark/mllib/recommendation/ALS.scala b/mllib/src/main/scala/org/apache/spark/mllib/recommendation/ALS.scala index b27ef1b949e2e..33aaf853e599d 100644 --- a/mllib/src/main/scala/org/apache/spark/mllib/recommendation/ALS.scala +++ b/mllib/src/main/scala/org/apache/spark/mllib/recommendation/ALS.scala @@ -28,7 +28,10 @@ import org.apache.spark.storage.StorageLevel * A more compact class to represent a rating than Tuple3[Int, Int, Double]. */ @Since("0.8.0") -case class Rating(user: Int, product: Int, rating: Double) +case class Rating @Since("0.8.0") ( + @Since("0.8.0") user: Int, + @Since("0.8.0") product: Int, + @Since("0.8.0") rating: Double) /** * Alternating Least Squares matrix factorization. @@ -59,6 +62,7 @@ case class Rating(user: Int, product: Int, rating: Double) * indicated user * preferences rather than explicit ratings given to items. */ +@Since("0.8.0") class ALS private ( private var numUserBlocks: Int, private var numProductBlocks: Int, @@ -74,6 +78,7 @@ class ALS private ( * Constructs an ALS instance with default parameters: {numBlocks: -1, rank: 10, iterations: 10, * lambda: 0.01, implicitPrefs: false, alpha: 1.0}. */ + @Since("0.8.0") def this() = this(-1, -1, 10, 10, 0.01, false, 1.0) /** If true, do alternating nonnegative least squares. */ @@ -90,6 +95,7 @@ class ALS private ( * Set the number of blocks for both user blocks and product blocks to parallelize the computation * into; pass -1 for an auto-configured number of blocks. Default: -1. */ + @Since("0.8.0") def setBlocks(numBlocks: Int): this.type = { this.numUserBlocks = numBlocks this.numProductBlocks = numBlocks @@ -99,6 +105,7 @@ class ALS private ( /** * Set the number of user blocks to parallelize the computation. */ + @Since("1.1.0") def setUserBlocks(numUserBlocks: Int): this.type = { this.numUserBlocks = numUserBlocks this @@ -107,30 +114,35 @@ class ALS private ( /** * Set the number of product blocks to parallelize the computation. */ + @Since("1.1.0") def setProductBlocks(numProductBlocks: Int): this.type = { this.numProductBlocks = numProductBlocks this } /** Set the rank of the feature matrices computed (number of features). Default: 10. */ + @Since("0.8.0") def setRank(rank: Int): this.type = { this.rank = rank this } /** Set the number of iterations to run. Default: 10. */ + @Since("0.8.0") def setIterations(iterations: Int): this.type = { this.iterations = iterations this } /** Set the regularization parameter, lambda. Default: 0.01. */ + @Since("0.8.0") def setLambda(lambda: Double): this.type = { this.lambda = lambda this } /** Sets whether to use implicit preference. Default: false. */ + @Since("0.8.1") def setImplicitPrefs(implicitPrefs: Boolean): this.type = { this.implicitPrefs = implicitPrefs this @@ -139,12 +151,14 @@ class ALS private ( /** * Sets the constant used in computing confidence in implicit ALS. Default: 1.0. 
*/ + @Since("0.8.1") def setAlpha(alpha: Double): this.type = { this.alpha = alpha this } /** Sets a random seed to have deterministic results. */ + @Since("1.0.0") def setSeed(seed: Long): this.type = { this.seed = seed this @@ -154,6 +168,7 @@ class ALS private ( * Set whether the least-squares problems solved at each iteration should have * nonnegativity constraints. */ + @Since("1.1.0") def setNonnegative(b: Boolean): this.type = { this.nonnegative = b this @@ -166,6 +181,7 @@ class ALS private ( * set `spark.rdd.compress` to `true` to reduce the space requirement, at the cost of speed. */ @DeveloperApi + @Since("1.1.0") def setIntermediateRDDStorageLevel(storageLevel: StorageLevel): this.type = { require(storageLevel != StorageLevel.NONE, "ALS is not designed to run without persisting intermediate RDDs.") @@ -181,6 +197,7 @@ class ALS private ( * at the cost of speed. */ @DeveloperApi + @Since("1.3.0") def setFinalRDDStorageLevel(storageLevel: StorageLevel): this.type = { this.finalRDDStorageLevel = storageLevel this @@ -194,6 +211,7 @@ class ALS private ( * this setting is ignored. */ @DeveloperApi + @Since("1.4.0") def setCheckpointInterval(checkpointInterval: Int): this.type = { this.checkpointInterval = checkpointInterval this @@ -203,6 +221,7 @@ class ALS private ( * Run ALS with the configured parameters on an input RDD of (user, product, rating) triples. * Returns a MatrixFactorizationModel with feature vectors for each user and product. */ + @Since("0.8.0") def run(ratings: RDD[Rating]): MatrixFactorizationModel = { val sc = ratings.context @@ -250,6 +269,7 @@ class ALS private ( /** * Java-friendly version of [[ALS.run]]. */ + @Since("1.3.0") def run(ratings: JavaRDD[Rating]): MatrixFactorizationModel = run(ratings.rdd) } diff --git a/mllib/src/main/scala/org/apache/spark/mllib/recommendation/MatrixFactorizationModel.scala b/mllib/src/main/scala/org/apache/spark/mllib/recommendation/MatrixFactorizationModel.scala index ba4cfdcd9f1dd..46562eb2ad0f7 100644 --- a/mllib/src/main/scala/org/apache/spark/mllib/recommendation/MatrixFactorizationModel.scala +++ b/mllib/src/main/scala/org/apache/spark/mllib/recommendation/MatrixFactorizationModel.scala @@ -52,10 +52,10 @@ import org.apache.spark.storage.StorageLevel * and the features computed for this product. */ @Since("0.8.0") -class MatrixFactorizationModel( - val rank: Int, - val userFeatures: RDD[(Int, Array[Double])], - val productFeatures: RDD[(Int, Array[Double])]) +class MatrixFactorizationModel @Since("0.8.0") ( + @Since("0.8.0") val rank: Int, + @Since("0.8.0") val userFeatures: RDD[(Int, Array[Double])], + @Since("0.8.0") val productFeatures: RDD[(Int, Array[Double])]) extends Saveable with Serializable with Logging { require(rank > 0) From d41d6c48207159490c1e1d9cc54015725cfa41b2 Mon Sep 17 00:00:00 2001 From: Davies Liu Date: Wed, 26 Aug 2015 16:04:44 -0700 Subject: [PATCH 096/802] [SPARK-10305] [SQL] fix create DataFrame from Python class cc jkbradley Author: Davies Liu Closes #8470 from davies/fix_create_df. 
--- python/pyspark/sql/tests.py | 12 ++++++++++++ python/pyspark/sql/types.py | 6 ++++++ 2 files changed, 18 insertions(+) diff --git a/python/pyspark/sql/tests.py b/python/pyspark/sql/tests.py index aacfb34c77618..cd32e26c64f22 100644 --- a/python/pyspark/sql/tests.py +++ b/python/pyspark/sql/tests.py @@ -145,6 +145,12 @@ class PythonOnlyPoint(ExamplePoint): __UDT__ = PythonOnlyUDT() +class MyObject(object): + def __init__(self, key, value): + self.key = key + self.value = value + + class DataTypeTests(unittest.TestCase): # regression test for SPARK-6055 def test_data_type_eq(self): @@ -383,6 +389,12 @@ def test_infer_nested_schema(self): df = self.sqlCtx.inferSchema(rdd) self.assertEquals(Row(field1=1, field2=u'row1'), df.first()) + def test_create_dataframe_from_objects(self): + data = [MyObject(1, "1"), MyObject(2, "2")] + df = self.sqlCtx.createDataFrame(data) + self.assertEqual(df.dtypes, [("key", "bigint"), ("value", "string")]) + self.assertEqual(df.first(), Row(key=1, value="1")) + def test_select_null_literal(self): df = self.sqlCtx.sql("select null as col") self.assertEquals(Row(col=None), df.first()) diff --git a/python/pyspark/sql/types.py b/python/pyspark/sql/types.py index ed4e5b594bd61..94e581a78364c 100644 --- a/python/pyspark/sql/types.py +++ b/python/pyspark/sql/types.py @@ -537,6 +537,9 @@ def toInternal(self, obj): return tuple(f.toInternal(obj.get(n)) for n, f in zip(self.names, self.fields)) elif isinstance(obj, (tuple, list)): return tuple(f.toInternal(v) for f, v in zip(self.fields, obj)) + elif hasattr(obj, "__dict__"): + d = obj.__dict__ + return tuple(f.toInternal(d.get(n)) for n, f in zip(self.names, self.fields)) else: raise ValueError("Unexpected tuple %r with StructType" % obj) else: @@ -544,6 +547,9 @@ def toInternal(self, obj): return tuple(obj.get(n) for n in self.names) elif isinstance(obj, (list, tuple)): return tuple(obj) + elif hasattr(obj, "__dict__"): + d = obj.__dict__ + return tuple(d.get(n) for n in self.names) else: raise ValueError("Unexpected tuple %r with StructType" % obj) From ad7f0f160be096c0fdae6e6cf7e3b6ba4a606de7 Mon Sep 17 00:00:00 2001 From: Shivaram Venkataraman Date: Wed, 26 Aug 2015 18:13:07 -0700 Subject: [PATCH 097/802] [SPARK-10308] [SPARKR] Add %in% to the exported namespace I also checked all the other functions defined in column.R, functions.R and DataFrame.R and everything else looked fine. cc yu-iskw Author: Shivaram Venkataraman Closes #8473 from shivaram/in-namespace. --- R/pkg/NAMESPACE | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/R/pkg/NAMESPACE b/R/pkg/NAMESPACE index 3e5c89d779b7b..5286c01986204 100644 --- a/R/pkg/NAMESPACE +++ b/R/pkg/NAMESPACE @@ -47,12 +47,12 @@ exportMethods("arrange", "join", "limit", "merge", + "mutate", + "na.omit", "names", "ncol", "nrow", "orderBy", - "mutate", - "names", "persist", "printSchema", "rbind", @@ -82,7 +82,8 @@ exportMethods("arrange", exportClasses("Column") -exportMethods("abs", +exportMethods("%in%", + "abs", "acos", "add_months", "alias", From 773ca037a43d464ce7f16fe693ca6034f09a35b7 Mon Sep 17 00:00:00 2001 From: Yu ISHIKAWA Date: Wed, 26 Aug 2015 18:14:32 -0700 Subject: [PATCH 098/802] [MINOR] [SPARKR] Fix some validation problems in SparkR Getting rid of some validation problems in SparkR https://github.com/apache/spark/pull/7883 cc shivaram ``` inst/tests/test_Serde.R:26:1: style: Trailing whitespace is superfluous. ^~ inst/tests/test_Serde.R:34:1: style: Trailing whitespace is superfluous. 
^~ inst/tests/test_Serde.R:37:38: style: Trailing whitespace is superfluous. expect_equal(class(x), "character") ^~ inst/tests/test_Serde.R:50:1: style: Trailing whitespace is superfluous. ^~ inst/tests/test_Serde.R:55:1: style: Trailing whitespace is superfluous. ^~ inst/tests/test_Serde.R:60:1: style: Trailing whitespace is superfluous. ^~ inst/tests/test_sparkSQL.R:611:1: style: Trailing whitespace is superfluous. ^~ R/DataFrame.R:664:1: style: Trailing whitespace is superfluous. ^~~~~~~~~~~~~~ R/DataFrame.R:670:55: style: Trailing whitespace is superfluous. df <- data.frame(row.names = 1 : nrow) ^~~~~~~~~~~~~~~~ R/DataFrame.R:672:1: style: Trailing whitespace is superfluous. ^~~~~~~~~~~~~~ R/DataFrame.R:686:49: style: Trailing whitespace is superfluous. df[[names[colIndex]]] <- vec ^~~~~~~~~~~~~~~~~~ ``` Author: Yu ISHIKAWA Closes #8474 from yu-iskw/minor-fix-sparkr. --- R/pkg/R/DataFrame.R | 8 ++++---- R/pkg/inst/tests/test_Serde.R | 12 ++++++------ R/pkg/inst/tests/test_sparkSQL.R | 2 +- 3 files changed, 11 insertions(+), 11 deletions(-) diff --git a/R/pkg/R/DataFrame.R b/R/pkg/R/DataFrame.R index a5162de705f8f..dd8126aebf467 100644 --- a/R/pkg/R/DataFrame.R +++ b/R/pkg/R/DataFrame.R @@ -661,15 +661,15 @@ setMethod("collect", # listCols is a list of columns listCols <- callJStatic("org.apache.spark.sql.api.r.SQLUtils", "dfToCols", x@sdf) stopifnot(length(listCols) == ncol) - + # An empty data.frame with 0 columns and number of rows as collected nrow <- length(listCols[[1]]) if (nrow <= 0) { df <- data.frame() } else { - df <- data.frame(row.names = 1 : nrow) + df <- data.frame(row.names = 1 : nrow) } - + # Append columns one by one for (colIndex in 1 : ncol) { # Note: appending a column of list type into a data.frame so that @@ -683,7 +683,7 @@ setMethod("collect", # TODO: more robust check on column of primitive types vec <- do.call(c, col) if (class(vec) != "list") { - df[[names[colIndex]]] <- vec + df[[names[colIndex]]] <- vec } else { # For columns of complex type, be careful to access them. # Get a column of complex type returns a list. 
diff --git a/R/pkg/inst/tests/test_Serde.R b/R/pkg/inst/tests/test_Serde.R index 009db85da2beb..dddce54d70443 100644 --- a/R/pkg/inst/tests/test_Serde.R +++ b/R/pkg/inst/tests/test_Serde.R @@ -23,7 +23,7 @@ test_that("SerDe of primitive types", { x <- callJStatic("SparkRHandler", "echo", 1L) expect_equal(x, 1L) expect_equal(class(x), "integer") - + x <- callJStatic("SparkRHandler", "echo", 1) expect_equal(x, 1) expect_equal(class(x), "numeric") @@ -31,10 +31,10 @@ test_that("SerDe of primitive types", { x <- callJStatic("SparkRHandler", "echo", TRUE) expect_true(x) expect_equal(class(x), "logical") - + x <- callJStatic("SparkRHandler", "echo", "abc") expect_equal(x, "abc") - expect_equal(class(x), "character") + expect_equal(class(x), "character") }) test_that("SerDe of list of primitive types", { @@ -47,17 +47,17 @@ test_that("SerDe of list of primitive types", { y <- callJStatic("SparkRHandler", "echo", x) expect_equal(x, y) expect_equal(class(y[[1]]), "numeric") - + x <- list(TRUE, FALSE) y <- callJStatic("SparkRHandler", "echo", x) expect_equal(x, y) expect_equal(class(y[[1]]), "logical") - + x <- list("a", "b", "c") y <- callJStatic("SparkRHandler", "echo", x) expect_equal(x, y) expect_equal(class(y[[1]]), "character") - + # Empty list x <- list() y <- callJStatic("SparkRHandler", "echo", x) diff --git a/R/pkg/inst/tests/test_sparkSQL.R b/R/pkg/inst/tests/test_sparkSQL.R index ee48a3dc0cc05..8e22c56824b16 100644 --- a/R/pkg/inst/tests/test_sparkSQL.R +++ b/R/pkg/inst/tests/test_sparkSQL.R @@ -608,7 +608,7 @@ test_that("subsetting", { df4 <- df[df$age %in% c(19, 30), 1:2] expect_equal(count(df4), 2) expect_equal(columns(df4), c("name", "age")) - + df5 <- df[df$age %in% c(19), c(1,2)] expect_equal(count(df5), 1) expect_equal(columns(df5), c("name", "age")) From 0fac144f6bd835395059154532d72cdb5dc7ef8d Mon Sep 17 00:00:00 2001 From: Cheng Lian Date: Wed, 26 Aug 2015 18:14:54 -0700 Subject: [PATCH 099/802] [SPARK-9424] [SQL] Parquet programming guide updates for 1.5 Author: Cheng Lian Closes #8467 from liancheng/spark-9424/parquet-docs-for-1.5. --- docs/sql-programming-guide.md | 45 ++++++++++++++++++++++++++++------- 1 file changed, 37 insertions(+), 8 deletions(-) diff --git a/docs/sql-programming-guide.md b/docs/sql-programming-guide.md index 33e7893d7bd0a..e64190b9b209d 100644 --- a/docs/sql-programming-guide.md +++ b/docs/sql-programming-guide.md @@ -1124,6 +1124,13 @@ a simple schema, and gradually add more columns to the schema as needed. In thi up with multiple Parquet files with different but mutually compatible schemas. The Parquet data source is now able to automatically detect this case and merge schemas of all these files. +Since schema merging is a relatively expensive operation, and is not a necessity in most cases, we +turned it off by default starting from 1.5.0. You may enable it by + +1. setting data source option `mergeSchema` to `true` when reading Parquet files (as shown in the + examples below), or +2. setting the global SQL option `spark.sql.parquet.mergeSchema` to `true`. +
@@ -1143,7 +1150,7 @@ val df2 = sc.makeRDD(6 to 10).map(i => (i, i * 3)).toDF("single", "triple") df2.write.parquet("data/test_table/key=2") // Read the partitioned table -val df3 = sqlContext.read.parquet("data/test_table") +val df3 = sqlContext.read.option("mergeSchema", "true").parquet("data/test_table") df3.printSchema() // The final schema consists of all 3 columns in the Parquet files together @@ -1165,16 +1172,16 @@ df3.printSchema() # Create a simple DataFrame, stored into a partition directory df1 = sqlContext.createDataFrame(sc.parallelize(range(1, 6))\ .map(lambda i: Row(single=i, double=i * 2))) -df1.save("data/test_table/key=1", "parquet") +df1.write.parquet("data/test_table/key=1") # Create another DataFrame in a new partition directory, # adding a new column and dropping an existing column df2 = sqlContext.createDataFrame(sc.parallelize(range(6, 11)) .map(lambda i: Row(single=i, triple=i * 3))) -df2.save("data/test_table/key=2", "parquet") +df2.write.parquet("data/test_table/key=2") # Read the partitioned table -df3 = sqlContext.load("data/test_table", "parquet") +df3 = sqlContext.read.option("mergeSchema", "true").parquet("data/test_table") df3.printSchema() # The final schema consists of all 3 columns in the Parquet files together @@ -1201,7 +1208,7 @@ saveDF(df1, "data/test_table/key=1", "parquet", "overwrite") saveDF(df2, "data/test_table/key=2", "parquet", "overwrite") # Read the partitioned table -df3 <- loadDF(sqlContext, "data/test_table", "parquet") +df3 <- loadDF(sqlContext, "data/test_table", "parquet", mergeSchema="true") printSchema(df3) # The final schema consists of all 3 columns in the Parquet files together @@ -1301,7 +1308,7 @@ Configuration of Parquet can be done using the `setConf` method on `SQLContext` spark.sql.parquet.binaryAsString false - Some other Parquet-producing systems, in particular Impala and older versions of Spark SQL, do + Some other Parquet-producing systems, in particular Impala, Hive, and older versions of Spark SQL, do not differentiate between binary data and strings when writing out the Parquet schema. This flag tells Spark SQL to interpret binary data as a string to provide compatibility with these systems. @@ -1310,8 +1317,7 @@ Configuration of Parquet can be done using the `setConf` method on `SQLContext` spark.sql.parquet.int96AsTimestamp true - Some Parquet-producing systems, in particular Impala, store Timestamp into INT96. Spark would also - store Timestamp as INT96 because we need to avoid precision lost of the nanoseconds field. This + Some Parquet-producing systems, in particular Impala and Hive, store Timestamp into INT96. This flag tells Spark SQL to interpret INT96 data as a timestamp to provide compatibility with these systems. @@ -1355,6 +1361,9 @@ Configuration of Parquet can be done using the `setConf` method on `SQLContext`

Note:

    +
  • + This option is automatically ignored if spark.speculation is turned on. +
  • This option must be set via Hadoop Configuration rather than Spark SQLConf. @@ -1371,6 +1380,26 @@ Configuration of Parquet can be done using the `setConf` method on `SQLContext`

    + + spark.sql.parquet.mergeSchema + false + +

    + When true, the Parquet data source merges schemas collected from all data files, otherwise the + schema is picked from the summary file or a random data file if no summary file is available. +

    + + + + spark.sql.parquet.mergeSchema + false + +

    + When true, the Parquet data source merges schemas collected from all data files, otherwise the + schema is picked from the summary file or a random data file if no summary file is available. +

    + + ## JSON Datasets From ce97834dc0cc55eece0e909a4061ca6f2123f60d Mon Sep 17 00:00:00 2001 From: Yanbo Liang Date: Wed, 26 Aug 2015 22:19:11 -0700 Subject: [PATCH 100/802] [SPARK-9964] [PYSPARK] [SQL] PySpark DataFrameReader accept RDD of String for JSON PySpark DataFrameReader should could accept an RDD of Strings (like the Scala version does) for JSON, rather than only taking a path. If this PR is merged, it should be duplicated to cover the other input types (not just JSON). Author: Yanbo Liang Closes #8444 from yanboliang/spark-9964. --- python/pyspark/sql/readwriter.py | 28 ++++++++++++++++++++++------ 1 file changed, 22 insertions(+), 6 deletions(-) diff --git a/python/pyspark/sql/readwriter.py b/python/pyspark/sql/readwriter.py index 78247c8fa7372..3fa6895880a97 100644 --- a/python/pyspark/sql/readwriter.py +++ b/python/pyspark/sql/readwriter.py @@ -15,8 +15,14 @@ # limitations under the License. # +import sys + +if sys.version >= '3': + basestring = unicode = str + from py4j.java_gateway import JavaClass +from pyspark import RDD from pyspark.sql import since from pyspark.sql.column import _to_seq from pyspark.sql.types import * @@ -125,23 +131,33 @@ def load(self, path=None, format=None, schema=None, **options): @since(1.4) def json(self, path, schema=None): """ - Loads a JSON file (one object per line) and returns the result as - a :class`DataFrame`. + Loads a JSON file (one object per line) or an RDD of Strings storing JSON objects + (one object per record) and returns the result as a :class`DataFrame`. If the ``schema`` parameter is not specified, this function goes through the input once to determine the input schema. - :param path: string, path to the JSON dataset. + :param path: string represents path to the JSON dataset, + or RDD of Strings storing JSON objects. :param schema: an optional :class:`StructType` for the input schema. - >>> df = sqlContext.read.json('python/test_support/sql/people.json') - >>> df.dtypes + >>> df1 = sqlContext.read.json('python/test_support/sql/people.json') + >>> df1.dtypes + [('age', 'bigint'), ('name', 'string')] + >>> rdd = sc.textFile('python/test_support/sql/people.json') + >>> df2 = sqlContext.read.json(rdd) + >>> df2.dtypes [('age', 'bigint'), ('name', 'string')] """ if schema is not None: self.schema(schema) - return self._df(self._jreader.json(path)) + if isinstance(path, basestring): + return self._df(self._jreader.json(path)) + elif isinstance(path, RDD): + return self._df(self._jreader.json(path._jrdd)) + else: + raise TypeError("path can be only string or RDD") @since(1.4) def table(self, tableName): From e936cf8088a06d6aefce44305f3904bbeb17b432 Mon Sep 17 00:00:00 2001 From: Shivaram Venkataraman Date: Wed, 26 Aug 2015 22:27:31 -0700 Subject: [PATCH 101/802] [SPARK-10219] [SPARKR] Fix varargsToEnv and add test case cc sun-rui davies Author: Shivaram Venkataraman Closes #8475 from shivaram/varargs-fix. --- R/pkg/R/utils.R | 3 ++- R/pkg/inst/tests/test_sparkSQL.R | 6 ++++++ 2 files changed, 8 insertions(+), 1 deletion(-) diff --git a/R/pkg/R/utils.R b/R/pkg/R/utils.R index 4f9f4d9cad2a8..3babcb519378e 100644 --- a/R/pkg/R/utils.R +++ b/R/pkg/R/utils.R @@ -314,7 +314,8 @@ convertEnvsToList <- function(keys, vals) { # Utility function to capture the varargs into environment object varargsToEnv <- function(...) { - pairs <- as.list(substitute(list(...)))[-1L] + # Based on http://stackoverflow.com/a/3057419/4577954 + pairs <- list(...) 
env <- new.env() for (name in names(pairs)) { env[[name]] <- pairs[[name]] diff --git a/R/pkg/inst/tests/test_sparkSQL.R b/R/pkg/inst/tests/test_sparkSQL.R index 8e22c56824b16..4b672e115f924 100644 --- a/R/pkg/inst/tests/test_sparkSQL.R +++ b/R/pkg/inst/tests/test_sparkSQL.R @@ -1060,6 +1060,12 @@ test_that("parquetFile works with multiple input paths", { parquetDF <- parquetFile(sqlContext, parquetPath, parquetPath2) expect_is(parquetDF, "DataFrame") expect_equal(count(parquetDF), count(df) * 2) + + # Test if varargs works with variables + saveMode <- "overwrite" + mergeSchema <- "true" + parquetPath3 <- tempfile(pattern = "parquetPath3", fileext = ".parquet") + write.df(df, parquetPath2, "parquet", mode = saveMode, mergeSchema = mergeSchema) }) test_that("describe() and summarize() on a DataFrame", { From de0278286cf6db8df53b0b68918ea114f2c77f1f Mon Sep 17 00:00:00 2001 From: Ram Sriharsha Date: Wed, 26 Aug 2015 23:12:55 -0700 Subject: [PATCH 102/802] =?UTF-8?q?[SPARK-10251]=20[CORE]=20some=20common?= =?UTF-8?q?=20types=20are=20not=20registered=20for=20Kryo=20Serializat?= =?UTF-8?q?=E2=80=A6?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit …ion by default Author: Ram Sriharsha Closes #8465 from harsha2010/SPARK-10251. --- .../spark/serializer/KryoSerializer.scala | 35 ++++++++++++++++++- .../serializer/KryoSerializerSuite.scala | 30 ++++++++++++++++ 2 files changed, 64 insertions(+), 1 deletion(-) diff --git a/core/src/main/scala/org/apache/spark/serializer/KryoSerializer.scala b/core/src/main/scala/org/apache/spark/serializer/KryoSerializer.scala index 048a938507277..b977711e7d5ad 100644 --- a/core/src/main/scala/org/apache/spark/serializer/KryoSerializer.scala +++ b/core/src/main/scala/org/apache/spark/serializer/KryoSerializer.scala @@ -22,6 +22,7 @@ import java.nio.ByteBuffer import javax.annotation.Nullable import scala.collection.JavaConverters._ +import scala.collection.mutable.ArrayBuffer import scala.reflect.ClassTag import com.esotericsoftware.kryo.{Kryo, KryoException} @@ -38,7 +39,7 @@ import org.apache.spark.network.nio.{GetBlock, GotBlock, PutBlock} import org.apache.spark.network.util.ByteUnit import org.apache.spark.scheduler.{CompressedMapStatus, HighlyCompressedMapStatus} import org.apache.spark.storage._ -import org.apache.spark.util.{BoundedPriorityQueue, SerializableConfiguration, SerializableJobConf} +import org.apache.spark.util.{Utils, BoundedPriorityQueue, SerializableConfiguration, SerializableJobConf} import org.apache.spark.util.collection.CompactBuffer /** @@ -131,6 +132,38 @@ class KryoSerializer(conf: SparkConf) // our code override the generic serializers in Chill for things like Seq new AllScalaRegistrar().apply(kryo) + // Register types missed by Chill. 
+ // scalastyle:off + kryo.register(classOf[Array[Tuple1[Any]]]) + kryo.register(classOf[Array[Tuple2[Any, Any]]]) + kryo.register(classOf[Array[Tuple3[Any, Any, Any]]]) + kryo.register(classOf[Array[Tuple4[Any, Any, Any, Any]]]) + kryo.register(classOf[Array[Tuple5[Any, Any, Any, Any, Any]]]) + kryo.register(classOf[Array[Tuple6[Any, Any, Any, Any, Any, Any]]]) + kryo.register(classOf[Array[Tuple7[Any, Any, Any, Any, Any, Any, Any]]]) + kryo.register(classOf[Array[Tuple8[Any, Any, Any, Any, Any, Any, Any, Any]]]) + kryo.register(classOf[Array[Tuple9[Any, Any, Any, Any, Any, Any, Any, Any, Any]]]) + kryo.register(classOf[Array[Tuple10[Any, Any, Any, Any, Any, Any, Any, Any, Any, Any]]]) + kryo.register(classOf[Array[Tuple11[Any, Any, Any, Any, Any, Any, Any, Any, Any, Any, Any]]]) + kryo.register(classOf[Array[Tuple12[Any, Any, Any, Any, Any, Any, Any, Any, Any, Any, Any, Any]]]) + kryo.register(classOf[Array[Tuple13[Any, Any, Any, Any, Any, Any, Any, Any, Any, Any, Any, Any, Any]]]) + kryo.register(classOf[Array[Tuple14[Any, Any, Any, Any, Any, Any, Any, Any, Any, Any, Any, Any, Any, Any]]]) + kryo.register(classOf[Array[Tuple15[Any, Any, Any, Any, Any, Any, Any, Any, Any, Any, Any, Any, Any, Any, Any]]]) + kryo.register(classOf[Array[Tuple16[Any, Any, Any, Any, Any, Any, Any, Any, Any, Any, Any, Any, Any, Any, Any, Any]]]) + kryo.register(classOf[Array[Tuple17[Any, Any, Any, Any, Any, Any, Any, Any, Any, Any, Any, Any, Any, Any, Any, Any, Any]]]) + kryo.register(classOf[Array[Tuple18[Any, Any, Any, Any, Any, Any, Any, Any, Any, Any, Any, Any, Any, Any, Any, Any, Any, Any]]]) + kryo.register(classOf[Array[Tuple19[Any, Any, Any, Any, Any, Any, Any, Any, Any, Any, Any, Any, Any, Any, Any, Any, Any, Any, Any]]]) + kryo.register(classOf[Array[Tuple20[Any, Any, Any, Any, Any, Any, Any, Any, Any, Any, Any, Any, Any, Any, Any, Any, Any, Any, Any, Any]]]) + kryo.register(classOf[Array[Tuple21[Any, Any, Any, Any, Any, Any, Any, Any, Any, Any, Any, Any, Any, Any, Any, Any, Any, Any, Any, Any, Any]]]) + kryo.register(classOf[Array[Tuple22[Any, Any, Any, Any, Any, Any, Any, Any, Any, Any, Any, Any, Any, Any, Any, Any, Any, Any, Any, Any, Any, Any]]]) + + // scalastyle:on + + kryo.register(None.getClass) + kryo.register(Nil.getClass) + kryo.register(Utils.classForName("scala.collection.immutable.$colon$colon")) + kryo.register(classOf[ArrayBuffer[Any]]) + kryo.setClassLoader(classLoader) kryo } diff --git a/core/src/test/scala/org/apache/spark/serializer/KryoSerializerSuite.scala b/core/src/test/scala/org/apache/spark/serializer/KryoSerializerSuite.scala index 8d1c9d17e977e..e428414cf6e85 100644 --- a/core/src/test/scala/org/apache/spark/serializer/KryoSerializerSuite.scala +++ b/core/src/test/scala/org/apache/spark/serializer/KryoSerializerSuite.scala @@ -150,6 +150,36 @@ class KryoSerializerSuite extends SparkFunSuite with SharedSparkContext { mutable.HashMap(1->"one", 2->"two", 3->"three"))) } + test("Bug: SPARK-10251") { + val ser = new KryoSerializer(conf.clone.set("spark.kryo.registrationRequired", "true")) + .newInstance() + def check[T: ClassTag](t: T) { + assert(ser.deserialize[T](ser.serialize(t)) === t) + } + check((1, 3)) + check(Array((1, 3))) + check(List((1, 3))) + check(List[Int]()) + check(List[Int](1, 2, 3)) + check(List[String]()) + check(List[String]("x", "y", "z")) + check(None) + check(Some(1)) + check(Some("hi")) + check(1 -> 1) + check(mutable.ArrayBuffer(1, 2, 3)) + check(mutable.ArrayBuffer("1", "2", "3")) + check(mutable.Map()) + check(mutable.Map(1 -> "one", 2 -> "two")) + 
check(mutable.Map("one" -> 1, "two" -> 2)) + check(mutable.HashMap(1 -> "one", 2 -> "two")) + check(mutable.HashMap("one" -> 1, "two" -> 2)) + check(List(Some(mutable.HashMap(1->1, 2->2)), None, Some(mutable.HashMap(3->4)))) + check(List( + mutable.HashMap("one" -> 1, "two" -> 2), + mutable.HashMap(1->"one", 2->"two", 3->"three"))) + } + test("ranges") { val ser = new KryoSerializer(conf).newInstance() def check[T: ClassTag](t: T) { From 9625d13d575c97bbff264f6a94838aae72c9202d Mon Sep 17 00:00:00 2001 From: Moussa Taifi Date: Thu, 27 Aug 2015 10:34:47 +0100 Subject: [PATCH 103/802] [DOCS] [STREAMING] [KAFKA] Fix typo in exactly once semantics Fix Typo in exactly once semantics [Semantics of output operations] link Author: Moussa Taifi Closes #8468 from moutai/patch-3. --- docs/streaming-kafka-integration.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/streaming-kafka-integration.md b/docs/streaming-kafka-integration.md index 7571e22575efd..5db39ae54a274 100644 --- a/docs/streaming-kafka-integration.md +++ b/docs/streaming-kafka-integration.md @@ -82,7 +82,7 @@ This approach has the following advantages over the receiver-based approach (i.e - *Efficiency:* Achieving zero-data loss in the first approach required the data to be stored in a Write Ahead Log, which further replicated the data. This is actually inefficient as the data effectively gets replicated twice - once by Kafka, and a second time by the Write Ahead Log. This second approach eliminates the problem as there is no receiver, and hence no need for Write Ahead Logs. As long as you have sufficient Kafka retention, messages can be recovered from Kafka. -- *Exactly-once semantics:* The first approach uses Kafka's high level API to store consumed offsets in Zookeeper. This is traditionally the way to consume data from Kafka. While this approach (in combination with write ahead logs) can ensure zero data loss (i.e. at-least once semantics), there is a small chance some records may get consumed twice under some failures. This occurs because of inconsistencies between data reliably received by Spark Streaming and offsets tracked by Zookeeper. Hence, in this second approach, we use simple Kafka API that does not use Zookeeper. Offsets are tracked by Spark Streaming within its checkpoints. This eliminates inconsistencies between Spark Streaming and Zookeeper/Kafka, and so each record is received by Spark Streaming effectively exactly once despite failures. In order to achieve exactly-once semantics for output of your results, your output operation that saves the data to an external data store must be either idempotent, or an atomic transaction that saves results and offsets (see [Semanitcs of output operations](streaming-programming-guide.html#semantics-of-output-operations) in the main programming guide for further information). +- *Exactly-once semantics:* The first approach uses Kafka's high level API to store consumed offsets in Zookeeper. This is traditionally the way to consume data from Kafka. While this approach (in combination with write ahead logs) can ensure zero data loss (i.e. at-least once semantics), there is a small chance some records may get consumed twice under some failures. This occurs because of inconsistencies between data reliably received by Spark Streaming and offsets tracked by Zookeeper. Hence, in this second approach, we use simple Kafka API that does not use Zookeeper. Offsets are tracked by Spark Streaming within its checkpoints. 
This eliminates inconsistencies between Spark Streaming and Zookeeper/Kafka, and so each record is received by Spark Streaming effectively exactly once despite failures. In order to achieve exactly-once semantics for output of your results, your output operation that saves the data to an external data store must be either idempotent, or an atomic transaction that saves results and offsets (see [Semantics of output operations](streaming-programming-guide.html#semantics-of-output-operations) in the main programming guide for further information). Note that one disadvantage of this approach is that it does not update offsets in Zookeeper, hence Zookeeper-based Kafka monitoring tools will not show progress. However, you can access the offsets processed by this approach in each batch and update Zookeeper yourself (see below). From 1650f6f56ed4b7f1a7f645c9e8d5ac533464bd78 Mon Sep 17 00:00:00 2001 From: Feynman Liang Date: Thu, 27 Aug 2015 10:44:44 +0100 Subject: [PATCH 104/802] [SPARK-10254] [ML] Removes Guava dependencies in spark.ml.feature JavaTests * Replaces `com.google.common` dependencies with `java.util.Arrays` * Small clean up in `JavaNormalizerSuite` Author: Feynman Liang Closes #8445 from feynmanliang/SPARK-10254. --- .../apache/spark/ml/feature/JavaBucketizerSuite.java | 5 +++-- .../org/apache/spark/ml/feature/JavaDCTSuite.java | 5 +++-- .../apache/spark/ml/feature/JavaHashingTFSuite.java | 5 +++-- .../apache/spark/ml/feature/JavaNormalizerSuite.java | 11 +++++------ .../org/apache/spark/ml/feature/JavaPCASuite.java | 4 ++-- .../ml/feature/JavaPolynomialExpansionSuite.java | 5 +++-- .../spark/ml/feature/JavaStandardScalerSuite.java | 4 ++-- .../apache/spark/ml/feature/JavaTokenizerSuite.java | 6 ++++-- .../spark/ml/feature/JavaVectorIndexerSuite.java | 5 ++--- .../spark/ml/feature/JavaVectorSlicerSuite.java | 4 ++-- .../apache/spark/ml/feature/JavaWord2VecSuite.java | 11 ++++++----- 11 files changed, 35 insertions(+), 30 deletions(-) diff --git a/mllib/src/test/java/org/apache/spark/ml/feature/JavaBucketizerSuite.java b/mllib/src/test/java/org/apache/spark/ml/feature/JavaBucketizerSuite.java index d5bd230a957a1..47d68de599da2 100644 --- a/mllib/src/test/java/org/apache/spark/ml/feature/JavaBucketizerSuite.java +++ b/mllib/src/test/java/org/apache/spark/ml/feature/JavaBucketizerSuite.java @@ -17,7 +17,8 @@ package org.apache.spark.ml.feature; -import com.google.common.collect.Lists; +import java.util.Arrays; + import org.junit.After; import org.junit.Assert; import org.junit.Before; @@ -54,7 +55,7 @@ public void tearDown() { public void bucketizerTest() { double[] splits = {-0.5, 0.0, 0.5}; - JavaRDD data = jsc.parallelize(Lists.newArrayList( + JavaRDD data = jsc.parallelize(Arrays.asList( RowFactory.create(-0.5), RowFactory.create(-0.3), RowFactory.create(0.0), diff --git a/mllib/src/test/java/org/apache/spark/ml/feature/JavaDCTSuite.java b/mllib/src/test/java/org/apache/spark/ml/feature/JavaDCTSuite.java index 845eed61c45c6..0f6ec64d97d36 100644 --- a/mllib/src/test/java/org/apache/spark/ml/feature/JavaDCTSuite.java +++ b/mllib/src/test/java/org/apache/spark/ml/feature/JavaDCTSuite.java @@ -17,7 +17,8 @@ package org.apache.spark.ml.feature; -import com.google.common.collect.Lists; +import java.util.Arrays; + import edu.emory.mathcs.jtransforms.dct.DoubleDCT_1D; import org.junit.After; import org.junit.Assert; @@ -56,7 +57,7 @@ public void tearDown() { @Test public void javaCompatibilityTest() { double[] input = new double[] {1D, 2D, 3D, 4D}; - JavaRDD data = 
jsc.parallelize(Lists.newArrayList( + JavaRDD data = jsc.parallelize(Arrays.asList( RowFactory.create(Vectors.dense(input)) )); DataFrame dataset = jsql.createDataFrame(data, new StructType(new StructField[]{ diff --git a/mllib/src/test/java/org/apache/spark/ml/feature/JavaHashingTFSuite.java b/mllib/src/test/java/org/apache/spark/ml/feature/JavaHashingTFSuite.java index 599e9cfd23ad4..03dd5369bddf7 100644 --- a/mllib/src/test/java/org/apache/spark/ml/feature/JavaHashingTFSuite.java +++ b/mllib/src/test/java/org/apache/spark/ml/feature/JavaHashingTFSuite.java @@ -17,7 +17,8 @@ package org.apache.spark.ml.feature; -import com.google.common.collect.Lists; +import java.util.Arrays; + import org.junit.After; import org.junit.Assert; import org.junit.Before; @@ -54,7 +55,7 @@ public void tearDown() { @Test public void hashingTF() { - JavaRDD jrdd = jsc.parallelize(Lists.newArrayList( + JavaRDD jrdd = jsc.parallelize(Arrays.asList( RowFactory.create(0.0, "Hi I heard about Spark"), RowFactory.create(0.0, "I wish Java could use case classes"), RowFactory.create(1.0, "Logistic regression models are neat") diff --git a/mllib/src/test/java/org/apache/spark/ml/feature/JavaNormalizerSuite.java b/mllib/src/test/java/org/apache/spark/ml/feature/JavaNormalizerSuite.java index d82f3b7e8c076..e17d549c5059b 100644 --- a/mllib/src/test/java/org/apache/spark/ml/feature/JavaNormalizerSuite.java +++ b/mllib/src/test/java/org/apache/spark/ml/feature/JavaNormalizerSuite.java @@ -17,15 +17,15 @@ package org.apache.spark.ml.feature; -import java.util.List; +import java.util.Arrays; -import com.google.common.collect.Lists; import org.junit.After; import org.junit.Before; import org.junit.Test; import org.apache.spark.api.java.JavaSparkContext; import org.apache.spark.mllib.linalg.Vectors; +import org.apache.spark.api.java.JavaRDD; import org.apache.spark.sql.DataFrame; import org.apache.spark.sql.SQLContext; @@ -48,13 +48,12 @@ public void tearDown() { @Test public void normalizer() { // The tests are to check Java compatibility. 
- List points = Lists.newArrayList( + JavaRDD points = jsc.parallelize(Arrays.asList( new VectorIndexerSuite.FeatureData(Vectors.dense(0.0, -2.0)), new VectorIndexerSuite.FeatureData(Vectors.dense(1.0, 3.0)), new VectorIndexerSuite.FeatureData(Vectors.dense(1.0, 4.0)) - ); - DataFrame dataFrame = jsql.createDataFrame(jsc.parallelize(points, 2), - VectorIndexerSuite.FeatureData.class); + )); + DataFrame dataFrame = jsql.createDataFrame(points, VectorIndexerSuite.FeatureData.class); Normalizer normalizer = new Normalizer() .setInputCol("features") .setOutputCol("normFeatures"); diff --git a/mllib/src/test/java/org/apache/spark/ml/feature/JavaPCASuite.java b/mllib/src/test/java/org/apache/spark/ml/feature/JavaPCASuite.java index 5cf43fec6f29e..e8f329f9cf29e 100644 --- a/mllib/src/test/java/org/apache/spark/ml/feature/JavaPCASuite.java +++ b/mllib/src/test/java/org/apache/spark/ml/feature/JavaPCASuite.java @@ -18,11 +18,11 @@ package org.apache.spark.ml.feature; import java.io.Serializable; +import java.util.Arrays; import java.util.List; import scala.Tuple2; -import com.google.common.collect.Lists; import org.junit.After; import org.junit.Assert; import org.junit.Before; @@ -78,7 +78,7 @@ public Vector getExpected() { @Test public void testPCA() { - List points = Lists.newArrayList( + List points = Arrays.asList( Vectors.sparse(5, new int[]{1, 3}, new double[]{1.0, 7.0}), Vectors.dense(2.0, 0.0, 3.0, 4.0, 5.0), Vectors.dense(4.0, 0.0, 0.0, 6.0, 7.0) diff --git a/mllib/src/test/java/org/apache/spark/ml/feature/JavaPolynomialExpansionSuite.java b/mllib/src/test/java/org/apache/spark/ml/feature/JavaPolynomialExpansionSuite.java index 5e8211c2c5118..834fedbb59e1b 100644 --- a/mllib/src/test/java/org/apache/spark/ml/feature/JavaPolynomialExpansionSuite.java +++ b/mllib/src/test/java/org/apache/spark/ml/feature/JavaPolynomialExpansionSuite.java @@ -17,7 +17,8 @@ package org.apache.spark.ml.feature; -import com.google.common.collect.Lists; +import java.util.Arrays; + import org.junit.After; import org.junit.Assert; import org.junit.Before; @@ -59,7 +60,7 @@ public void polynomialExpansionTest() { .setOutputCol("polyFeatures") .setDegree(3); - JavaRDD data = jsc.parallelize(Lists.newArrayList( + JavaRDD data = jsc.parallelize(Arrays.asList( RowFactory.create( Vectors.dense(-2.0, 2.3), Vectors.dense(-2.0, 4.0, -8.0, 2.3, -4.6, 9.2, 5.29, -10.58, 12.17) diff --git a/mllib/src/test/java/org/apache/spark/ml/feature/JavaStandardScalerSuite.java b/mllib/src/test/java/org/apache/spark/ml/feature/JavaStandardScalerSuite.java index 74eb2733f06ef..ed74363f59e34 100644 --- a/mllib/src/test/java/org/apache/spark/ml/feature/JavaStandardScalerSuite.java +++ b/mllib/src/test/java/org/apache/spark/ml/feature/JavaStandardScalerSuite.java @@ -17,9 +17,9 @@ package org.apache.spark.ml.feature; +import java.util.Arrays; import java.util.List; -import com.google.common.collect.Lists; import org.junit.After; import org.junit.Before; import org.junit.Test; @@ -48,7 +48,7 @@ public void tearDown() { @Test public void standardScaler() { // The tests are to check Java compatibility. 
- List points = Lists.newArrayList( + List points = Arrays.asList( new VectorIndexerSuite.FeatureData(Vectors.dense(0.0, -2.0)), new VectorIndexerSuite.FeatureData(Vectors.dense(1.0, 3.0)), new VectorIndexerSuite.FeatureData(Vectors.dense(1.0, 4.0)) diff --git a/mllib/src/test/java/org/apache/spark/ml/feature/JavaTokenizerSuite.java b/mllib/src/test/java/org/apache/spark/ml/feature/JavaTokenizerSuite.java index 3806f650025b2..02309ce63219a 100644 --- a/mllib/src/test/java/org/apache/spark/ml/feature/JavaTokenizerSuite.java +++ b/mllib/src/test/java/org/apache/spark/ml/feature/JavaTokenizerSuite.java @@ -17,7 +17,8 @@ package org.apache.spark.ml.feature; -import com.google.common.collect.Lists; +import java.util.Arrays; + import org.junit.After; import org.junit.Assert; import org.junit.Before; @@ -54,7 +55,8 @@ public void regexTokenizer() { .setGaps(true) .setMinTokenLength(3); - JavaRDD rdd = jsc.parallelize(Lists.newArrayList( + + JavaRDD rdd = jsc.parallelize(Arrays.asList( new TokenizerTestData("Test of tok.", new String[] {"Test", "tok."}), new TokenizerTestData("Te,st. punct", new String[] {"Te,st.", "punct"}) )); diff --git a/mllib/src/test/java/org/apache/spark/ml/feature/JavaVectorIndexerSuite.java b/mllib/src/test/java/org/apache/spark/ml/feature/JavaVectorIndexerSuite.java index c7ae5468b9429..bfcca62fa1c98 100644 --- a/mllib/src/test/java/org/apache/spark/ml/feature/JavaVectorIndexerSuite.java +++ b/mllib/src/test/java/org/apache/spark/ml/feature/JavaVectorIndexerSuite.java @@ -18,6 +18,7 @@ package org.apache.spark.ml.feature; import java.io.Serializable; +import java.util.Arrays; import java.util.List; import java.util.Map; @@ -26,8 +27,6 @@ import org.junit.Before; import org.junit.Test; -import com.google.common.collect.Lists; - import org.apache.spark.api.java.JavaSparkContext; import org.apache.spark.ml.feature.VectorIndexerSuite.FeatureData; import org.apache.spark.mllib.linalg.Vectors; @@ -52,7 +51,7 @@ public void tearDown() { @Test public void vectorIndexerAPI() { // The tests are to check Java compatibility. 
- List points = Lists.newArrayList( + List points = Arrays.asList( new FeatureData(Vectors.dense(0.0, -2.0)), new FeatureData(Vectors.dense(1.0, 3.0)), new FeatureData(Vectors.dense(1.0, 4.0)) diff --git a/mllib/src/test/java/org/apache/spark/ml/feature/JavaVectorSlicerSuite.java b/mllib/src/test/java/org/apache/spark/ml/feature/JavaVectorSlicerSuite.java index 56988b9fb29cb..f953361427586 100644 --- a/mllib/src/test/java/org/apache/spark/ml/feature/JavaVectorSlicerSuite.java +++ b/mllib/src/test/java/org/apache/spark/ml/feature/JavaVectorSlicerSuite.java @@ -17,7 +17,7 @@ package org.apache.spark.ml.feature; -import com.google.common.collect.Lists; +import java.util.Arrays; import org.junit.After; import org.junit.Assert; @@ -63,7 +63,7 @@ public void vectorSlice() { }; AttributeGroup group = new AttributeGroup("userFeatures", attrs); - JavaRDD jrdd = jsc.parallelize(Lists.newArrayList( + JavaRDD jrdd = jsc.parallelize(Arrays.asList( RowFactory.create(Vectors.sparse(3, new int[]{0, 1}, new double[]{-2.0, 2.3})), RowFactory.create(Vectors.dense(-2.0, 2.3, 0.0)) )); diff --git a/mllib/src/test/java/org/apache/spark/ml/feature/JavaWord2VecSuite.java b/mllib/src/test/java/org/apache/spark/ml/feature/JavaWord2VecSuite.java index 39c70157f83c0..70f5ad9432212 100644 --- a/mllib/src/test/java/org/apache/spark/ml/feature/JavaWord2VecSuite.java +++ b/mllib/src/test/java/org/apache/spark/ml/feature/JavaWord2VecSuite.java @@ -17,7 +17,8 @@ package org.apache.spark.ml.feature; -import com.google.common.collect.Lists; +import java.util.Arrays; + import org.junit.After; import org.junit.Assert; import org.junit.Before; @@ -50,10 +51,10 @@ public void tearDown() { @Test public void testJavaWord2Vec() { - JavaRDD jrdd = jsc.parallelize(Lists.newArrayList( - RowFactory.create(Lists.newArrayList("Hi I heard about Spark".split(" "))), - RowFactory.create(Lists.newArrayList("I wish Java could use case classes".split(" "))), - RowFactory.create(Lists.newArrayList("Logistic regression models are neat".split(" "))) + JavaRDD jrdd = jsc.parallelize(Arrays.asList( + RowFactory.create(Arrays.asList("Hi I heard about Spark".split(" "))), + RowFactory.create(Arrays.asList("I wish Java could use case classes".split(" "))), + RowFactory.create(Arrays.asList("Logistic regression models are neat".split(" "))) )); StructType schema = new StructType(new StructField[]{ new StructField("text", new ArrayType(DataTypes.StringType, true), false, Metadata.empty()) From 75d62307946283b03bec6aaf1bdd4f2b08c93915 Mon Sep 17 00:00:00 2001 From: Feynman Liang Date: Thu, 27 Aug 2015 10:45:35 +0100 Subject: [PATCH 105/802] [SPARK-10255] [ML] Removes Guava dependencies from spark.ml.param JavaTests Author: Feynman Liang Closes #8446 from feynmanliang/SPARK-10255. 
--- .../java/org/apache/spark/ml/param/JavaParamsSuite.java | 7 ++++--- .../java/org/apache/spark/ml/param/JavaTestParams.java | 5 ++--- 2 files changed, 6 insertions(+), 6 deletions(-) diff --git a/mllib/src/test/java/org/apache/spark/ml/param/JavaParamsSuite.java b/mllib/src/test/java/org/apache/spark/ml/param/JavaParamsSuite.java index 9890155e9f865..fa777f3d42a9a 100644 --- a/mllib/src/test/java/org/apache/spark/ml/param/JavaParamsSuite.java +++ b/mllib/src/test/java/org/apache/spark/ml/param/JavaParamsSuite.java @@ -17,7 +17,8 @@ package org.apache.spark.ml.param; -import com.google.common.collect.Lists; +import java.util.Arrays; + import org.junit.After; import org.junit.Assert; import org.junit.Before; @@ -61,7 +62,7 @@ public void testParamValidate() { ParamValidators.ltEq(1.0); ParamValidators.inRange(0, 1, true, false); ParamValidators.inRange(0, 1); - ParamValidators.inArray(Lists.newArrayList(0, 1, 3)); - ParamValidators.inArray(Lists.newArrayList("a", "b")); + ParamValidators.inArray(Arrays.asList(0, 1, 3)); + ParamValidators.inArray(Arrays.asList("a", "b")); } } diff --git a/mllib/src/test/java/org/apache/spark/ml/param/JavaTestParams.java b/mllib/src/test/java/org/apache/spark/ml/param/JavaTestParams.java index dc6ce8061f62b..65841182df9b4 100644 --- a/mllib/src/test/java/org/apache/spark/ml/param/JavaTestParams.java +++ b/mllib/src/test/java/org/apache/spark/ml/param/JavaTestParams.java @@ -17,10 +17,9 @@ package org.apache.spark.ml.param; +import java.util.Arrays; import java.util.List; -import com.google.common.collect.Lists; - import org.apache.spark.ml.util.Identifiable$; /** @@ -89,7 +88,7 @@ private void init() { myIntParam_ = new IntParam(this, "myIntParam", "this is an int param", ParamValidators.gt(0)); myDoubleParam_ = new DoubleParam(this, "myDoubleParam", "this is a double param", ParamValidators.inRange(0.0, 1.0)); - List validStrings = Lists.newArrayList("a", "b"); + List validStrings = Arrays.asList("a", "b"); myStringParam_ = new Param(this, "myStringParam", "this is a string param", ParamValidators.inArray(validStrings)); myDoubleArrayParam_ = From 1a446f75b6cac46caea0217a66abeb226946ac71 Mon Sep 17 00:00:00 2001 From: Feynman Liang Date: Thu, 27 Aug 2015 10:46:18 +0100 Subject: [PATCH 106/802] [SPARK-10256] [ML] Removes guava dependency from spark.ml.classification JavaTests Author: Feynman Liang Closes #8447 from feynmanliang/SPARK-10256. 
--- .../apache/spark/ml/classification/JavaNaiveBayesSuite.java | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/mllib/src/test/java/org/apache/spark/ml/classification/JavaNaiveBayesSuite.java b/mllib/src/test/java/org/apache/spark/ml/classification/JavaNaiveBayesSuite.java index a700c9cddb206..8fd7bf55a2e5d 100644 --- a/mllib/src/test/java/org/apache/spark/ml/classification/JavaNaiveBayesSuite.java +++ b/mllib/src/test/java/org/apache/spark/ml/classification/JavaNaiveBayesSuite.java @@ -18,8 +18,8 @@ package org.apache.spark.ml.classification; import java.io.Serializable; +import java.util.Arrays; -import com.google.common.collect.Lists; import org.junit.After; import org.junit.Before; import org.junit.Test; @@ -74,7 +74,7 @@ public void naiveBayesDefaultParams() { @Test public void testNaiveBayes() { - JavaRDD jrdd = jsc.parallelize(Lists.newArrayList( + JavaRDD jrdd = jsc.parallelize(Arrays.asList( RowFactory.create(0.0, Vectors.dense(1.0, 0.0, 0.0)), RowFactory.create(0.0, Vectors.dense(2.0, 0.0, 0.0)), RowFactory.create(1.0, Vectors.dense(0.0, 1.0, 0.0)), From b02e8187225d1765f67ce38864dfaca487be8a44 Mon Sep 17 00:00:00 2001 From: Jacek Laskowski Date: Thu, 27 Aug 2015 11:07:37 +0100 Subject: [PATCH 107/802] [SPARK-9613] [HOTFIX] Fix usage of JavaConverters removed in Scala 2.11 Fix for [JavaConverters.asJavaListConverter](http://www.scala-lang.org/api/2.10.5/index.html#scala.collection.JavaConverters$) being removed in 2.11.7 and hence the build fails with the 2.11 profile enabled. Tested with the default 2.10 and 2.11 profiles. BUILD SUCCESS in both cases. Build for 2.10: ./build/mvn -Pyarn -Phadoop-2.6 -Dhadoop.version=2.7.1 -DskipTests clean install and 2.11: ./dev/change-scala-version.sh 2.11 ./build/mvn -Pyarn -Phadoop-2.6 -Dhadoop.version=2.7.1 -Dscala-2.11 -DskipTests clean install Author: Jacek Laskowski Closes #8479 from jaceklaskowski/SPARK-9613-hotfix. --- .../org/apache/spark/ml/classification/JavaOneVsRestSuite.java | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/mllib/src/test/java/org/apache/spark/ml/classification/JavaOneVsRestSuite.java b/mllib/src/test/java/org/apache/spark/ml/classification/JavaOneVsRestSuite.java index 2744e020e9e49..253cabf0133d0 100644 --- a/mllib/src/test/java/org/apache/spark/ml/classification/JavaOneVsRestSuite.java +++ b/mllib/src/test/java/org/apache/spark/ml/classification/JavaOneVsRestSuite.java @@ -55,7 +55,7 @@ public void setUp() { double[] xMean = {5.843, 3.057, 3.758, 1.199}; double[] xVariance = {0.6856, 0.1899, 3.116, 0.581}; - List points = JavaConverters.asJavaListConverter( + List points = JavaConverters.seqAsJavaListConverter( generateMultinomialLogisticInput(weights, xMean, xVariance, true, nPoints, 42) ).asJava(); datasetRDD = jsc.parallelize(points, 2); From e1f4de4a7d15d4ca4b5c64ff929ac3980f5d706f Mon Sep 17 00:00:00 2001 From: Feynman Liang Date: Thu, 27 Aug 2015 18:46:41 +0100 Subject: [PATCH 108/802] [SPARK-10257] [MLLIB] Removes Guava from all spark.mllib Java tests * Replaces instances of `Lists.newArrayList` with `Arrays.asList` * Replaces `commons.lang.StringUtils` over `com.google.collections.Strings` * Replaces `List` interface over `ArrayList` implementations This PR along with #8445 #8446 #8447 completely removes all `com.google.collections.Lists` dependencies within mllib's Java tests. Author: Feynman Liang Closes #8451 from feynmanliang/SPARK-10257. 
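For readers skimming the many mechanical hunks that follow, the standalone sketch below illustrates why the JDK collections are a drop-in replacement here: `Arrays.asList` builds the fixed-content fixtures the tests need without Guava, and a plain `java.util.ArrayList` covers the spots that previously used `Lists.newArrayListWithCapacity`.

import java.util.ArrayList;
import java.util.Arrays;
import java.util.List;

public class ListFixtures {
  public static void main(String[] args) {
    // Fixed contents: Arrays.asList wraps the array directly, no Guava required.
    // The returned list is fixed-size, which is all a test fixture needs.
    List<String> words = Arrays.asList("this is a sentence".split(" "));

    // Growable list with a capacity hint: java.util.ArrayList replaces
    // Guava's Lists.newArrayListWithCapacity.
    List<Integer> lengths = new ArrayList<Integer>(words.size());
    for (String word : words) {
      lengths.add(word.length());
    }

    System.out.println(words + " -> " + lengths);
  }
}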
--- .../JavaStreamingLogisticRegressionSuite.java | 10 +++---- .../clustering/JavaGaussianMixtureSuite.java | 4 +-- .../mllib/clustering/JavaKMeansSuite.java | 9 +++---- .../clustering/JavaStreamingKMeansSuite.java | 10 +++---- .../spark/mllib/feature/JavaTfIdfSuite.java | 19 +++++++------ .../mllib/feature/JavaWord2VecSuite.java | 6 ++--- .../mllib/fpm/JavaAssociationRulesSuite.java | 5 ++-- .../spark/mllib/fpm/JavaFPGrowthSuite.java | 17 ++++++------ .../spark/mllib/linalg/JavaVectorsSuite.java | 5 ++-- .../mllib/random/JavaRandomRDDsSuite.java | 27 ++++++++++--------- .../mllib/recommendation/JavaALSSuite.java | 5 ++-- .../JavaIsotonicRegressionSuite.java | 7 ++--- .../JavaStreamingLinearRegressionSuite.java | 10 +++---- .../spark/mllib/stat/JavaStatisticsSuite.java | 11 ++++---- 14 files changed, 71 insertions(+), 74 deletions(-) diff --git a/mllib/src/test/java/org/apache/spark/mllib/classification/JavaStreamingLogisticRegressionSuite.java b/mllib/src/test/java/org/apache/spark/mllib/classification/JavaStreamingLogisticRegressionSuite.java index 55787f8606d48..c9e5ee22f3273 100644 --- a/mllib/src/test/java/org/apache/spark/mllib/classification/JavaStreamingLogisticRegressionSuite.java +++ b/mllib/src/test/java/org/apache/spark/mllib/classification/JavaStreamingLogisticRegressionSuite.java @@ -18,11 +18,11 @@ package org.apache.spark.mllib.classification; import java.io.Serializable; +import java.util.Arrays; import java.util.List; import scala.Tuple2; -import com.google.common.collect.Lists; import org.junit.After; import org.junit.Before; import org.junit.Test; @@ -60,16 +60,16 @@ public void tearDown() { @Test @SuppressWarnings("unchecked") public void javaAPI() { - List trainingBatch = Lists.newArrayList( + List trainingBatch = Arrays.asList( new LabeledPoint(1.0, Vectors.dense(1.0)), new LabeledPoint(0.0, Vectors.dense(0.0))); JavaDStream training = - attachTestInputStream(ssc, Lists.newArrayList(trainingBatch, trainingBatch), 2); - List> testBatch = Lists.newArrayList( + attachTestInputStream(ssc, Arrays.asList(trainingBatch, trainingBatch), 2); + List> testBatch = Arrays.asList( new Tuple2(10, Vectors.dense(1.0)), new Tuple2(11, Vectors.dense(0.0))); JavaPairDStream test = JavaPairDStream.fromJavaDStream( - attachTestInputStream(ssc, Lists.newArrayList(testBatch, testBatch), 2)); + attachTestInputStream(ssc, Arrays.asList(testBatch, testBatch), 2)); StreamingLogisticRegressionWithSGD slr = new StreamingLogisticRegressionWithSGD() .setNumIterations(2) .setInitialWeights(Vectors.dense(0.0)); diff --git a/mllib/src/test/java/org/apache/spark/mllib/clustering/JavaGaussianMixtureSuite.java b/mllib/src/test/java/org/apache/spark/mllib/clustering/JavaGaussianMixtureSuite.java index 467a7a69e8f30..123f78da54e34 100644 --- a/mllib/src/test/java/org/apache/spark/mllib/clustering/JavaGaussianMixtureSuite.java +++ b/mllib/src/test/java/org/apache/spark/mllib/clustering/JavaGaussianMixtureSuite.java @@ -18,9 +18,9 @@ package org.apache.spark.mllib.clustering; import java.io.Serializable; +import java.util.Arrays; import java.util.List; -import com.google.common.collect.Lists; import org.junit.After; import org.junit.Before; import org.junit.Test; @@ -48,7 +48,7 @@ public void tearDown() { @Test public void runGaussianMixture() { - List points = Lists.newArrayList( + List points = Arrays.asList( Vectors.dense(1.0, 2.0, 6.0), Vectors.dense(1.0, 3.0, 0.0), Vectors.dense(1.0, 4.0, 6.0) diff --git a/mllib/src/test/java/org/apache/spark/mllib/clustering/JavaKMeansSuite.java 
b/mllib/src/test/java/org/apache/spark/mllib/clustering/JavaKMeansSuite.java index 31676e64025d0..ad06676c72ac6 100644 --- a/mllib/src/test/java/org/apache/spark/mllib/clustering/JavaKMeansSuite.java +++ b/mllib/src/test/java/org/apache/spark/mllib/clustering/JavaKMeansSuite.java @@ -18,6 +18,7 @@ package org.apache.spark.mllib.clustering; import java.io.Serializable; +import java.util.Arrays; import java.util.List; import org.junit.After; @@ -25,8 +26,6 @@ import org.junit.Test; import static org.junit.Assert.*; -import com.google.common.collect.Lists; - import org.apache.spark.api.java.JavaRDD; import org.apache.spark.api.java.JavaSparkContext; import org.apache.spark.mllib.linalg.Vector; @@ -48,7 +47,7 @@ public void tearDown() { @Test public void runKMeansUsingStaticMethods() { - List points = Lists.newArrayList( + List points = Arrays.asList( Vectors.dense(1.0, 2.0, 6.0), Vectors.dense(1.0, 3.0, 0.0), Vectors.dense(1.0, 4.0, 6.0) @@ -67,7 +66,7 @@ public void runKMeansUsingStaticMethods() { @Test public void runKMeansUsingConstructor() { - List points = Lists.newArrayList( + List points = Arrays.asList( Vectors.dense(1.0, 2.0, 6.0), Vectors.dense(1.0, 3.0, 0.0), Vectors.dense(1.0, 4.0, 6.0) @@ -90,7 +89,7 @@ public void runKMeansUsingConstructor() { @Test public void testPredictJavaRDD() { - List points = Lists.newArrayList( + List points = Arrays.asList( Vectors.dense(1.0, 2.0, 6.0), Vectors.dense(1.0, 3.0, 0.0), Vectors.dense(1.0, 4.0, 6.0) diff --git a/mllib/src/test/java/org/apache/spark/mllib/clustering/JavaStreamingKMeansSuite.java b/mllib/src/test/java/org/apache/spark/mllib/clustering/JavaStreamingKMeansSuite.java index 3b0e879eec77f..d644766d1e54d 100644 --- a/mllib/src/test/java/org/apache/spark/mllib/clustering/JavaStreamingKMeansSuite.java +++ b/mllib/src/test/java/org/apache/spark/mllib/clustering/JavaStreamingKMeansSuite.java @@ -18,11 +18,11 @@ package org.apache.spark.mllib.clustering; import java.io.Serializable; +import java.util.Arrays; import java.util.List; import scala.Tuple2; -import com.google.common.collect.Lists; import org.junit.After; import org.junit.Before; import org.junit.Test; @@ -60,16 +60,16 @@ public void tearDown() { @Test @SuppressWarnings("unchecked") public void javaAPI() { - List trainingBatch = Lists.newArrayList( + List trainingBatch = Arrays.asList( Vectors.dense(1.0), Vectors.dense(0.0)); JavaDStream training = - attachTestInputStream(ssc, Lists.newArrayList(trainingBatch, trainingBatch), 2); - List> testBatch = Lists.newArrayList( + attachTestInputStream(ssc, Arrays.asList(trainingBatch, trainingBatch), 2); + List> testBatch = Arrays.asList( new Tuple2(10, Vectors.dense(1.0)), new Tuple2(11, Vectors.dense(0.0))); JavaPairDStream test = JavaPairDStream.fromJavaDStream( - attachTestInputStream(ssc, Lists.newArrayList(testBatch, testBatch), 2)); + attachTestInputStream(ssc, Arrays.asList(testBatch, testBatch), 2)); StreamingKMeans skmeans = new StreamingKMeans() .setK(1) .setDecayFactor(1.0) diff --git a/mllib/src/test/java/org/apache/spark/mllib/feature/JavaTfIdfSuite.java b/mllib/src/test/java/org/apache/spark/mllib/feature/JavaTfIdfSuite.java index fbc26167ce66f..8a320afa4b13d 100644 --- a/mllib/src/test/java/org/apache/spark/mllib/feature/JavaTfIdfSuite.java +++ b/mllib/src/test/java/org/apache/spark/mllib/feature/JavaTfIdfSuite.java @@ -18,14 +18,13 @@ package org.apache.spark.mllib.feature; import java.io.Serializable; -import java.util.ArrayList; +import java.util.Arrays; import java.util.List; import org.junit.After; import 
org.junit.Assert; import org.junit.Before; import org.junit.Test; -import com.google.common.collect.Lists; import org.apache.spark.api.java.JavaRDD; import org.apache.spark.api.java.JavaSparkContext; @@ -50,10 +49,10 @@ public void tfIdf() { // The tests are to check Java compatibility. HashingTF tf = new HashingTF(); @SuppressWarnings("unchecked") - JavaRDD> documents = sc.parallelize(Lists.newArrayList( - Lists.newArrayList("this is a sentence".split(" ")), - Lists.newArrayList("this is another sentence".split(" ")), - Lists.newArrayList("this is still a sentence".split(" "))), 2); + JavaRDD> documents = sc.parallelize(Arrays.asList( + Arrays.asList("this is a sentence".split(" ")), + Arrays.asList("this is another sentence".split(" ")), + Arrays.asList("this is still a sentence".split(" "))), 2); JavaRDD termFreqs = tf.transform(documents); termFreqs.collect(); IDF idf = new IDF(); @@ -70,10 +69,10 @@ public void tfIdfMinimumDocumentFrequency() { // The tests are to check Java compatibility. HashingTF tf = new HashingTF(); @SuppressWarnings("unchecked") - JavaRDD> documents = sc.parallelize(Lists.newArrayList( - Lists.newArrayList("this is a sentence".split(" ")), - Lists.newArrayList("this is another sentence".split(" ")), - Lists.newArrayList("this is still a sentence".split(" "))), 2); + JavaRDD> documents = sc.parallelize(Arrays.asList( + Arrays.asList("this is a sentence".split(" ")), + Arrays.asList("this is another sentence".split(" ")), + Arrays.asList("this is still a sentence".split(" "))), 2); JavaRDD termFreqs = tf.transform(documents); termFreqs.collect(); IDF idf = new IDF(2); diff --git a/mllib/src/test/java/org/apache/spark/mllib/feature/JavaWord2VecSuite.java b/mllib/src/test/java/org/apache/spark/mllib/feature/JavaWord2VecSuite.java index fb7afe8c6434b..e13ed07e283dd 100644 --- a/mllib/src/test/java/org/apache/spark/mllib/feature/JavaWord2VecSuite.java +++ b/mllib/src/test/java/org/apache/spark/mllib/feature/JavaWord2VecSuite.java @@ -18,11 +18,11 @@ package org.apache.spark.mllib.feature; import java.io.Serializable; +import java.util.Arrays; import java.util.List; import scala.Tuple2; -import com.google.common.collect.Lists; import com.google.common.base.Strings; import org.junit.After; import org.junit.Assert; @@ -51,8 +51,8 @@ public void tearDown() { public void word2Vec() { // The tests are to check Java compatibility. 
String sentence = Strings.repeat("a b ", 100) + Strings.repeat("a c ", 10); - List words = Lists.newArrayList(sentence.split(" ")); - List> localDoc = Lists.newArrayList(words, words); + List words = Arrays.asList(sentence.split(" ")); + List> localDoc = Arrays.asList(words, words); JavaRDD> doc = sc.parallelize(localDoc); Word2Vec word2vec = new Word2Vec() .setVectorSize(10) diff --git a/mllib/src/test/java/org/apache/spark/mllib/fpm/JavaAssociationRulesSuite.java b/mllib/src/test/java/org/apache/spark/mllib/fpm/JavaAssociationRulesSuite.java index d7c2cb3ae2067..2bef7a8609757 100644 --- a/mllib/src/test/java/org/apache/spark/mllib/fpm/JavaAssociationRulesSuite.java +++ b/mllib/src/test/java/org/apache/spark/mllib/fpm/JavaAssociationRulesSuite.java @@ -17,17 +17,16 @@ package org.apache.spark.mllib.fpm; import java.io.Serializable; +import java.util.Arrays; import org.junit.After; import org.junit.Before; import org.junit.Test; -import com.google.common.collect.Lists; import org.apache.spark.api.java.JavaRDD; import org.apache.spark.api.java.JavaSparkContext; import org.apache.spark.mllib.fpm.FPGrowth.FreqItemset; - public class JavaAssociationRulesSuite implements Serializable { private transient JavaSparkContext sc; @@ -46,7 +45,7 @@ public void tearDown() { public void runAssociationRules() { @SuppressWarnings("unchecked") - JavaRDD> freqItemsets = sc.parallelize(Lists.newArrayList( + JavaRDD> freqItemsets = sc.parallelize(Arrays.asList( new FreqItemset(new String[] {"a"}, 15L), new FreqItemset(new String[] {"b"}, 35L), new FreqItemset(new String[] {"a", "b"}, 12L) diff --git a/mllib/src/test/java/org/apache/spark/mllib/fpm/JavaFPGrowthSuite.java b/mllib/src/test/java/org/apache/spark/mllib/fpm/JavaFPGrowthSuite.java index 9ce2c52dca8b6..154f75d75e4a6 100644 --- a/mllib/src/test/java/org/apache/spark/mllib/fpm/JavaFPGrowthSuite.java +++ b/mllib/src/test/java/org/apache/spark/mllib/fpm/JavaFPGrowthSuite.java @@ -18,13 +18,12 @@ package org.apache.spark.mllib.fpm; import java.io.Serializable; -import java.util.ArrayList; +import java.util.Arrays; import java.util.List; import org.junit.After; import org.junit.Before; import org.junit.Test; -import com.google.common.collect.Lists; import static org.junit.Assert.*; import org.apache.spark.api.java.JavaRDD; @@ -48,13 +47,13 @@ public void tearDown() { public void runFPGrowth() { @SuppressWarnings("unchecked") - JavaRDD> rdd = sc.parallelize(Lists.newArrayList( - Lists.newArrayList("r z h k p".split(" ")), - Lists.newArrayList("z y x w v u t s".split(" ")), - Lists.newArrayList("s x o n r".split(" ")), - Lists.newArrayList("x z y m t s q e".split(" ")), - Lists.newArrayList("z".split(" ")), - Lists.newArrayList("x z y r q t p".split(" "))), 2); + JavaRDD> rdd = sc.parallelize(Arrays.asList( + Arrays.asList("r z h k p".split(" ")), + Arrays.asList("z y x w v u t s".split(" ")), + Arrays.asList("s x o n r".split(" ")), + Arrays.asList("x z y m t s q e".split(" ")), + Arrays.asList("z".split(" ")), + Arrays.asList("x z y r q t p".split(" "))), 2); FPGrowthModel model = new FPGrowth() .setMinSupport(0.5) diff --git a/mllib/src/test/java/org/apache/spark/mllib/linalg/JavaVectorsSuite.java b/mllib/src/test/java/org/apache/spark/mllib/linalg/JavaVectorsSuite.java index 1421067dc61ed..77c8c6274f374 100644 --- a/mllib/src/test/java/org/apache/spark/mllib/linalg/JavaVectorsSuite.java +++ b/mllib/src/test/java/org/apache/spark/mllib/linalg/JavaVectorsSuite.java @@ -18,11 +18,10 @@ package org.apache.spark.mllib.linalg; import java.io.Serializable; 
+import java.util.Arrays; import scala.Tuple2; -import com.google.common.collect.Lists; - import org.junit.Test; import static org.junit.Assert.*; @@ -37,7 +36,7 @@ public void denseArrayConstruction() { @Test public void sparseArrayConstruction() { @SuppressWarnings("unchecked") - Vector v = Vectors.sparse(3, Lists.>newArrayList( + Vector v = Vectors.sparse(3, Arrays.asList( new Tuple2(0, 2.0), new Tuple2(2, 3.0))); assertArrayEquals(new double[]{2.0, 0.0, 3.0}, v.toArray(), 0.0); diff --git a/mllib/src/test/java/org/apache/spark/mllib/random/JavaRandomRDDsSuite.java b/mllib/src/test/java/org/apache/spark/mllib/random/JavaRandomRDDsSuite.java index fcc13c00cbdc5..33d81b1e9592b 100644 --- a/mllib/src/test/java/org/apache/spark/mllib/random/JavaRandomRDDsSuite.java +++ b/mllib/src/test/java/org/apache/spark/mllib/random/JavaRandomRDDsSuite.java @@ -17,7 +17,8 @@ package org.apache.spark.mllib.random; -import com.google.common.collect.Lists; +import java.util.Arrays; + import org.apache.spark.api.java.JavaRDD; import org.junit.Assert; import org.junit.After; @@ -51,7 +52,7 @@ public void testUniformRDD() { JavaDoubleRDD rdd1 = uniformJavaRDD(sc, m); JavaDoubleRDD rdd2 = uniformJavaRDD(sc, m, p); JavaDoubleRDD rdd3 = uniformJavaRDD(sc, m, p, seed); - for (JavaDoubleRDD rdd: Lists.newArrayList(rdd1, rdd2, rdd3)) { + for (JavaDoubleRDD rdd: Arrays.asList(rdd1, rdd2, rdd3)) { Assert.assertEquals(m, rdd.count()); } } @@ -64,7 +65,7 @@ public void testNormalRDD() { JavaDoubleRDD rdd1 = normalJavaRDD(sc, m); JavaDoubleRDD rdd2 = normalJavaRDD(sc, m, p); JavaDoubleRDD rdd3 = normalJavaRDD(sc, m, p, seed); - for (JavaDoubleRDD rdd: Lists.newArrayList(rdd1, rdd2, rdd3)) { + for (JavaDoubleRDD rdd: Arrays.asList(rdd1, rdd2, rdd3)) { Assert.assertEquals(m, rdd.count()); } } @@ -79,7 +80,7 @@ public void testLNormalRDD() { JavaDoubleRDD rdd1 = logNormalJavaRDD(sc, mean, std, m); JavaDoubleRDD rdd2 = logNormalJavaRDD(sc, mean, std, m, p); JavaDoubleRDD rdd3 = logNormalJavaRDD(sc, mean, std, m, p, seed); - for (JavaDoubleRDD rdd: Lists.newArrayList(rdd1, rdd2, rdd3)) { + for (JavaDoubleRDD rdd: Arrays.asList(rdd1, rdd2, rdd3)) { Assert.assertEquals(m, rdd.count()); } } @@ -93,7 +94,7 @@ public void testPoissonRDD() { JavaDoubleRDD rdd1 = poissonJavaRDD(sc, mean, m); JavaDoubleRDD rdd2 = poissonJavaRDD(sc, mean, m, p); JavaDoubleRDD rdd3 = poissonJavaRDD(sc, mean, m, p, seed); - for (JavaDoubleRDD rdd: Lists.newArrayList(rdd1, rdd2, rdd3)) { + for (JavaDoubleRDD rdd: Arrays.asList(rdd1, rdd2, rdd3)) { Assert.assertEquals(m, rdd.count()); } } @@ -107,7 +108,7 @@ public void testExponentialRDD() { JavaDoubleRDD rdd1 = exponentialJavaRDD(sc, mean, m); JavaDoubleRDD rdd2 = exponentialJavaRDD(sc, mean, m, p); JavaDoubleRDD rdd3 = exponentialJavaRDD(sc, mean, m, p, seed); - for (JavaDoubleRDD rdd: Lists.newArrayList(rdd1, rdd2, rdd3)) { + for (JavaDoubleRDD rdd: Arrays.asList(rdd1, rdd2, rdd3)) { Assert.assertEquals(m, rdd.count()); } } @@ -122,7 +123,7 @@ public void testGammaRDD() { JavaDoubleRDD rdd1 = gammaJavaRDD(sc, shape, scale, m); JavaDoubleRDD rdd2 = gammaJavaRDD(sc, shape, scale, m, p); JavaDoubleRDD rdd3 = gammaJavaRDD(sc, shape, scale, m, p, seed); - for (JavaDoubleRDD rdd: Lists.newArrayList(rdd1, rdd2, rdd3)) { + for (JavaDoubleRDD rdd: Arrays.asList(rdd1, rdd2, rdd3)) { Assert.assertEquals(m, rdd.count()); } } @@ -138,7 +139,7 @@ public void testUniformVectorRDD() { JavaRDD rdd1 = uniformJavaVectorRDD(sc, m, n); JavaRDD rdd2 = uniformJavaVectorRDD(sc, m, n, p); JavaRDD rdd3 = 
uniformJavaVectorRDD(sc, m, n, p, seed); - for (JavaRDD rdd: Lists.newArrayList(rdd1, rdd2, rdd3)) { + for (JavaRDD rdd: Arrays.asList(rdd1, rdd2, rdd3)) { Assert.assertEquals(m, rdd.count()); Assert.assertEquals(n, rdd.first().size()); } @@ -154,7 +155,7 @@ public void testNormalVectorRDD() { JavaRDD rdd1 = normalJavaVectorRDD(sc, m, n); JavaRDD rdd2 = normalJavaVectorRDD(sc, m, n, p); JavaRDD rdd3 = normalJavaVectorRDD(sc, m, n, p, seed); - for (JavaRDD rdd: Lists.newArrayList(rdd1, rdd2, rdd3)) { + for (JavaRDD rdd: Arrays.asList(rdd1, rdd2, rdd3)) { Assert.assertEquals(m, rdd.count()); Assert.assertEquals(n, rdd.first().size()); } @@ -172,7 +173,7 @@ public void testLogNormalVectorRDD() { JavaRDD rdd1 = logNormalJavaVectorRDD(sc, mean, std, m, n); JavaRDD rdd2 = logNormalJavaVectorRDD(sc, mean, std, m, n, p); JavaRDD rdd3 = logNormalJavaVectorRDD(sc, mean, std, m, n, p, seed); - for (JavaRDD rdd: Lists.newArrayList(rdd1, rdd2, rdd3)) { + for (JavaRDD rdd: Arrays.asList(rdd1, rdd2, rdd3)) { Assert.assertEquals(m, rdd.count()); Assert.assertEquals(n, rdd.first().size()); } @@ -189,7 +190,7 @@ public void testPoissonVectorRDD() { JavaRDD rdd1 = poissonJavaVectorRDD(sc, mean, m, n); JavaRDD rdd2 = poissonJavaVectorRDD(sc, mean, m, n, p); JavaRDD rdd3 = poissonJavaVectorRDD(sc, mean, m, n, p, seed); - for (JavaRDD rdd: Lists.newArrayList(rdd1, rdd2, rdd3)) { + for (JavaRDD rdd: Arrays.asList(rdd1, rdd2, rdd3)) { Assert.assertEquals(m, rdd.count()); Assert.assertEquals(n, rdd.first().size()); } @@ -206,7 +207,7 @@ public void testExponentialVectorRDD() { JavaRDD rdd1 = exponentialJavaVectorRDD(sc, mean, m, n); JavaRDD rdd2 = exponentialJavaVectorRDD(sc, mean, m, n, p); JavaRDD rdd3 = exponentialJavaVectorRDD(sc, mean, m, n, p, seed); - for (JavaRDD rdd: Lists.newArrayList(rdd1, rdd2, rdd3)) { + for (JavaRDD rdd: Arrays.asList(rdd1, rdd2, rdd3)) { Assert.assertEquals(m, rdd.count()); Assert.assertEquals(n, rdd.first().size()); } @@ -224,7 +225,7 @@ public void testGammaVectorRDD() { JavaRDD rdd1 = gammaJavaVectorRDD(sc, shape, scale, m, n); JavaRDD rdd2 = gammaJavaVectorRDD(sc, shape, scale, m, n, p); JavaRDD rdd3 = gammaJavaVectorRDD(sc, shape, scale, m, n, p, seed); - for (JavaRDD rdd: Lists.newArrayList(rdd1, rdd2, rdd3)) { + for (JavaRDD rdd: Arrays.asList(rdd1, rdd2, rdd3)) { Assert.assertEquals(m, rdd.count()); Assert.assertEquals(n, rdd.first().size()); } diff --git a/mllib/src/test/java/org/apache/spark/mllib/recommendation/JavaALSSuite.java b/mllib/src/test/java/org/apache/spark/mllib/recommendation/JavaALSSuite.java index af688c504cf1e..271dda4662e0d 100644 --- a/mllib/src/test/java/org/apache/spark/mllib/recommendation/JavaALSSuite.java +++ b/mllib/src/test/java/org/apache/spark/mllib/recommendation/JavaALSSuite.java @@ -18,12 +18,12 @@ package org.apache.spark.mllib.recommendation; import java.io.Serializable; +import java.util.ArrayList; import java.util.List; import scala.Tuple2; import scala.Tuple3; -import com.google.common.collect.Lists; import org.jblas.DoubleMatrix; import org.junit.After; import org.junit.Assert; @@ -56,8 +56,7 @@ void validatePrediction( double matchThreshold, boolean implicitPrefs, DoubleMatrix truePrefs) { - List> localUsersProducts = - Lists.newArrayListWithCapacity(users * products); + List> localUsersProducts = new ArrayList(users * products); for (int u=0; u < users; ++u) { for (int p=0; p < products; ++p) { localUsersProducts.add(new Tuple2(u, p)); diff --git a/mllib/src/test/java/org/apache/spark/mllib/regression/JavaIsotonicRegressionSuite.java 
b/mllib/src/test/java/org/apache/spark/mllib/regression/JavaIsotonicRegressionSuite.java index d38fc91ace3cf..32c2f4f3395b7 100644 --- a/mllib/src/test/java/org/apache/spark/mllib/regression/JavaIsotonicRegressionSuite.java +++ b/mllib/src/test/java/org/apache/spark/mllib/regression/JavaIsotonicRegressionSuite.java @@ -18,11 +18,12 @@ package org.apache.spark.mllib.regression; import java.io.Serializable; +import java.util.ArrayList; +import java.util.Arrays; import java.util.List; import scala.Tuple3; -import com.google.common.collect.Lists; import org.junit.After; import org.junit.Assert; import org.junit.Before; @@ -36,7 +37,7 @@ public class JavaIsotonicRegressionSuite implements Serializable { private transient JavaSparkContext sc; private List> generateIsotonicInput(double[] labels) { - List> input = Lists.newArrayList(); + ArrayList> input = new ArrayList(labels.length); for (int i = 1; i <= labels.length; i++) { input.add(new Tuple3(labels[i-1], (double) i, 1d)); @@ -77,7 +78,7 @@ public void testIsotonicRegressionPredictionsJavaRDD() { IsotonicRegressionModel model = runIsotonicRegression(new double[]{1, 2, 3, 3, 1, 6, 7, 8, 11, 9, 10, 12}); - JavaDoubleRDD testRDD = sc.parallelizeDoubles(Lists.newArrayList(0.0, 1.0, 9.5, 12.0, 13.0)); + JavaDoubleRDD testRDD = sc.parallelizeDoubles(Arrays.asList(0.0, 1.0, 9.5, 12.0, 13.0)); List predictions = model.predict(testRDD).collect(); Assert.assertTrue(predictions.get(0) == 1d); diff --git a/mllib/src/test/java/org/apache/spark/mllib/regression/JavaStreamingLinearRegressionSuite.java b/mllib/src/test/java/org/apache/spark/mllib/regression/JavaStreamingLinearRegressionSuite.java index 899c4ea607869..dbf6488d41085 100644 --- a/mllib/src/test/java/org/apache/spark/mllib/regression/JavaStreamingLinearRegressionSuite.java +++ b/mllib/src/test/java/org/apache/spark/mllib/regression/JavaStreamingLinearRegressionSuite.java @@ -18,11 +18,11 @@ package org.apache.spark.mllib.regression; import java.io.Serializable; +import java.util.Arrays; import java.util.List; import scala.Tuple2; -import com.google.common.collect.Lists; import org.junit.After; import org.junit.Before; import org.junit.Test; @@ -59,16 +59,16 @@ public void tearDown() { @Test @SuppressWarnings("unchecked") public void javaAPI() { - List trainingBatch = Lists.newArrayList( + List trainingBatch = Arrays.asList( new LabeledPoint(1.0, Vectors.dense(1.0)), new LabeledPoint(0.0, Vectors.dense(0.0))); JavaDStream training = - attachTestInputStream(ssc, Lists.newArrayList(trainingBatch, trainingBatch), 2); - List> testBatch = Lists.newArrayList( + attachTestInputStream(ssc, Arrays.asList(trainingBatch, trainingBatch), 2); + List> testBatch = Arrays.asList( new Tuple2(10, Vectors.dense(1.0)), new Tuple2(11, Vectors.dense(0.0))); JavaPairDStream test = JavaPairDStream.fromJavaDStream( - attachTestInputStream(ssc, Lists.newArrayList(testBatch, testBatch), 2)); + attachTestInputStream(ssc, Arrays.asList(testBatch, testBatch), 2)); StreamingLinearRegressionWithSGD slr = new StreamingLinearRegressionWithSGD() .setNumIterations(2) .setInitialWeights(Vectors.dense(0.0)); diff --git a/mllib/src/test/java/org/apache/spark/mllib/stat/JavaStatisticsSuite.java b/mllib/src/test/java/org/apache/spark/mllib/stat/JavaStatisticsSuite.java index eb4e3698624bc..4795809e47a46 100644 --- a/mllib/src/test/java/org/apache/spark/mllib/stat/JavaStatisticsSuite.java +++ b/mllib/src/test/java/org/apache/spark/mllib/stat/JavaStatisticsSuite.java @@ -19,7 +19,8 @@ import java.io.Serializable; -import 
com.google.common.collect.Lists; +import java.util.Arrays; + import org.junit.After; import org.junit.Before; import org.junit.Test; @@ -50,8 +51,8 @@ public void tearDown() { @Test public void testCorr() { - JavaRDD x = sc.parallelize(Lists.newArrayList(1.0, 2.0, 3.0, 4.0)); - JavaRDD y = sc.parallelize(Lists.newArrayList(1.1, 2.2, 3.1, 4.3)); + JavaRDD x = sc.parallelize(Arrays.asList(1.0, 2.0, 3.0, 4.0)); + JavaRDD y = sc.parallelize(Arrays.asList(1.1, 2.2, 3.1, 4.3)); Double corr1 = Statistics.corr(x, y); Double corr2 = Statistics.corr(x, y, "pearson"); @@ -61,7 +62,7 @@ public void testCorr() { @Test public void kolmogorovSmirnovTest() { - JavaDoubleRDD data = sc.parallelizeDoubles(Lists.newArrayList(0.2, 1.0, -1.0, 2.0)); + JavaDoubleRDD data = sc.parallelizeDoubles(Arrays.asList(0.2, 1.0, -1.0, 2.0)); KolmogorovSmirnovTestResult testResult1 = Statistics.kolmogorovSmirnovTest(data, "norm"); KolmogorovSmirnovTestResult testResult2 = Statistics.kolmogorovSmirnovTest( data, "norm", 0.0, 1.0); @@ -69,7 +70,7 @@ public void kolmogorovSmirnovTest() { @Test public void chiSqTest() { - JavaRDD data = sc.parallelize(Lists.newArrayList( + JavaRDD data = sc.parallelize(Arrays.asList( new LabeledPoint(0.0, Vectors.dense(0.1, 2.3)), new LabeledPoint(1.0, Vectors.dense(1.5, 5.1)), new LabeledPoint(0.0, Vectors.dense(2.4, 8.1)))); From fdd466bed7a7151dd066d732ef98d225f4acda4a Mon Sep 17 00:00:00 2001 From: Vyacheslav Baranov Date: Thu, 27 Aug 2015 18:56:18 +0100 Subject: [PATCH 109/802] [SPARK-10182] [MLLIB] GeneralizedLinearModel doesn't unpersist cached data `GeneralizedLinearModel` creates a cached RDD when building a model. It's inconvenient, since these RDDs flood the memory when building several models in a row, so useful data might get evicted from the cache. The proposed solution is to always cache the dataset & remove the warning. There's a caveat though: input dataset gets evaluated twice, in line 270 when fitting `StandardScaler` for the first time, and when running optimizer for the second time. So, it might worth to return removed warning. Another possible solution is to disable caching entirely & return removed warning. I don't really know what approach is better. Author: Vyacheslav Baranov Closes #8395 from SlavikBaranov/SPARK-10182. --- .../spark/mllib/regression/GeneralizedLinearAlgorithm.scala | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/mllib/src/main/scala/org/apache/spark/mllib/regression/GeneralizedLinearAlgorithm.scala b/mllib/src/main/scala/org/apache/spark/mllib/regression/GeneralizedLinearAlgorithm.scala index 7e3b4d5648fe3..8f657bfb9c730 100644 --- a/mllib/src/main/scala/org/apache/spark/mllib/regression/GeneralizedLinearAlgorithm.scala +++ b/mllib/src/main/scala/org/apache/spark/mllib/regression/GeneralizedLinearAlgorithm.scala @@ -359,6 +359,11 @@ abstract class GeneralizedLinearAlgorithm[M <: GeneralizedLinearModel] + " parent RDDs are also uncached.") } + // Unpersist cached data + if (data.getStorageLevel != StorageLevel.NONE) { + data.unpersist(false) + } + createModel(weights, intercept) } } From dc86a227e4fc8a9d8c3e8c68da8dff9298447fd0 Mon Sep 17 00:00:00 2001 From: Michael Armbrust Date: Thu, 27 Aug 2015 11:45:15 -0700 Subject: [PATCH 110/802] [SPARK-9148] [SPARK-10252] [SQL] Update SQL Programming Guide Author: Michael Armbrust Closes #8441 from marmbrus/documentation. 
--- docs/sql-programming-guide.md | 92 +++++++++++++++++++++++++++-------- 1 file changed, 73 insertions(+), 19 deletions(-) diff --git a/docs/sql-programming-guide.md b/docs/sql-programming-guide.md index e64190b9b209d..99fec6c7785af 100644 --- a/docs/sql-programming-guide.md +++ b/docs/sql-programming-guide.md @@ -11,7 +11,7 @@ title: Spark SQL and DataFrames Spark SQL is a Spark module for structured data processing. It provides a programming abstraction called DataFrames and can also act as distributed SQL query engine. -For how to enable Hive support, please refer to the [Hive Tables](#hive-tables) section. +Spark SQL can also be used to read data from an existing Hive installation. For more on how to configure this feature, please refer to the [Hive Tables](#hive-tables) section. # DataFrames @@ -213,6 +213,11 @@ df.groupBy("age").count().show() // 30 1 {% endhighlight %} +For a complete list of the types of operations that can be performed on a DataFrame refer to the [API Documentation](api/scala/index.html#org.apache.spark.sql.DataFrame). + +In addition to simple column references and expressions, DataFrames also have a rich library of functions including string manipulation, date arithmetic, common math operations and more. The complete list is available in the [DataFrame Function Reference](api/scala/index.html#org.apache.spark.sql.DataFrame). + +
@@ -263,6 +268,10 @@ df.groupBy("age").count().show(); // 30 1 {% endhighlight %} +For a complete list of the types of operations that can be performed on a DataFrame refer to the [API Documentation](api/java/org/apache/spark/sql/DataFrame.html). + +In addition to simple column references and expressions, DataFrames also have a rich library of functions including string manipulation, date arithmetic, common math operations and more. The complete list is available in the [DataFrame Function Reference](api/java/org/apache/spark/sql/functions.html). +
@@ -320,6 +329,10 @@ df.groupBy("age").count().show() {% endhighlight %} +For a complete list of the types of operations that can be performed on a DataFrame refer to the [API Documentation](api/python/pyspark.sql.html#pyspark.sql.DataFrame). + +In addition to simple column references and expressions, DataFrames also have a rich library of functions including string manipulation, date arithmetic, common math operations and more. The complete list is available in the [DataFrame Function Reference](api/python/pyspark.sql.html#module-pyspark.sql.functions). +
@@ -370,10 +383,13 @@ showDF(count(groupBy(df, "age"))) {% endhighlight %} -
+For a complete list of the types of operations that can be performed on a DataFrame refer to the [API Documentation](api/R/index.html). + +In addition to simple column references and expressions, DataFrames also have a rich library of functions including string manipulation, date arithmetic, common math operations and more. The complete list is available in the [DataFrame Function Reference](api/R/index.html).
+ ## Running SQL Queries Programmatically @@ -870,12 +886,11 @@ saveDF(select(df, "name", "age"), "namesAndAges.parquet", "parquet") Save operations can optionally take a `SaveMode`, that specifies how to handle existing data if present. It is important to realize that these save modes do not utilize any locking and are not -atomic. Thus, it is not safe to have multiple writers attempting to write to the same location. -Additionally, when performing a `Overwrite`, the data will be deleted before writing out the +atomic. Additionally, when performing a `Overwrite`, the data will be deleted before writing out the new data. - + @@ -1671,12 +1686,12 @@ results <- collect(sql(sqlContext, "FROM src SELECT key, value")) ### Interacting with Different Versions of Hive Metastore One of the most important pieces of Spark SQL's Hive support is interaction with Hive metastore, -which enables Spark SQL to access metadata of Hive tables. Starting from Spark 1.4.0, a single binary build of Spark SQL can be used to query different versions of Hive metastores, using the configuration described below. +which enables Spark SQL to access metadata of Hive tables. Starting from Spark 1.4.0, a single binary +build of Spark SQL can be used to query different versions of Hive metastores, using the configuration described below. +Note that independent of the version of Hive that is being used to talk to the metastore, internally Spark SQL +will compile against Hive 1.2.1 and use those classes for internal execution (serdes, UDFs, UDAFs, etc). -Internally, Spark SQL uses two Hive clients, one for executing native Hive commands like `SET` -and `DESCRIBE`, the other dedicated for communicating with Hive metastore. The former uses Hive -jars of version 0.13.1, which are bundled with Spark 1.4.0. The latter uses Hive jars of the -version specified by users. An isolated classloader is used here to avoid dependency conflicts. +The following options can be used to configure the version of Hive that is used to retrieve metadata:
-<tr><th>Scala/Java</th><th>Python</th><th>Meaning</th></tr>
+<tr><th>Scala/Java</th><th>Any Language</th><th>Meaning</th></tr>
 <tr>
   <td><code>SaveMode.ErrorIfExists</code> (default)</td>
   <td><code>"error"</code> (default)</td>
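For illustration only (not from the patch), a small Scala sketch of how the save modes above are selected; `df` and the output path are placeholders.

{% highlight scala %}
import org.apache.spark.sql.SaveMode

// Default behavior: fail if data already exists at the target path.
df.write.mode(SaveMode.ErrorIfExists).parquet("namesAndAges.parquet")
// Overwrite: existing data is deleted before the new data is written out.
df.write.mode(SaveMode.Overwrite).parquet("namesAndAges.parquet")
{% endhighlight %}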
@@ -1685,7 +1700,7 @@ version specified by users. An isolated classloader is used here to avoid depend @@ -1696,12 +1711,16 @@ version specified by users. An isolated classloader is used here to avoid depend property can be one of three options:
   1. builtin
-     Use Hive 0.13.1, which is bundled with the Spark assembly jar when -Phive is
+     Use Hive 1.2.1, which is bundled with the Spark assembly jar when -Phive is
      enabled. When this option is chosen, spark.sql.hive.metastore.version must be
-     either 0.13.1 or not defined.
+     either 1.2.1 or not defined.
   2. maven
-     Use Hive jars of specified version downloaded from Maven repositories.
-  3. A classpath in the standard format for both Hive and Hadoop.
+     Use Hive jars of specified version downloaded from Maven repositories. This configuration
+     is not generally recommended for production deployments.
+  3. A classpath in the standard format for the JVM. This classpath must include all of Hive
+     and its dependencies, including the correct version of Hadoop. These jars only need to be
+     present on the driver, but if you are running in yarn cluster mode then you must ensure
+     they are packaged with your application.
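A hedged configuration sketch for the options listed above; the application name and the chosen metastore version are illustrative assumptions, and only the two property names come from the guide.

{% highlight scala %}
import org.apache.spark.{SparkConf, SparkContext}
import org.apache.spark.sql.hive.HiveContext

// Talk to a Hive 0.13.1 metastore while executing against the built-in Hive 1.2.1 classes.
val conf = new SparkConf()
  .setAppName("HiveMetastoreVersionExample")            // placeholder name
  .set("spark.sql.hive.metastore.version", "0.13.1")    // version of the remote metastore
  .set("spark.sql.hive.metastore.jars", "maven")        // download matching client jars from Maven
val sc = new SparkContext(conf)
val sqlContext = new HiveContext(sc)
sqlContext.sql("SHOW TABLES").show()
{% endhighlight %}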
@@ -2017,6 +2036,28 @@ options. # Migration Guide +## Upgrading From Spark SQL 1.4 to 1.5 + + - Optimized execution using manually managed memory (Tungsten) is now enabled by default, along with + code generation for expression evaluation. These features can both be disabled by setting + `spark.sql.tungsten.enabled` to `false. + - Parquet schema merging is no longer enabled by default. It can be re-enabled by setting + `spark.sql.parquet.mergeSchema` to `true`. + - Resolution of strings to columns in python now supports using dots (`.`) to qualify the column or + access nested values. For example `df['table.column.nestedField']`. However, this means that if + your column name contains any dots you must now escape them using backticks (e.g., ``table.`column.with.dots`.nested``). + - In-memory columnar storage partition pruning is on by default. It can be disabled by setting + `spark.sql.inMemoryColumnarStorage.partitionPruning` to `false`. + - Unlimited precision decimal columns are no longer supported, instead Spark SQL enforces a maximum + precision of 38. When inferring schema from `BigDecimal` objects, a precision of (38, 18) is now + used. When no precision is specified in DDL then the default remains `Decimal(10, 0)`. + - Timestamps are now stored at a precision of 1us, rather than 1ns + - In the `sql` dialect, floating point numbers are now parsed as decimal. HiveQL parsing remains + unchanged. + - The canonical name of SQL/DataFrame functions are now lower case (e.g. sum vs SUM). + - It has been determined that using the DirectOutputCommitter when speculation is enabled is unsafe + and thus this output committer will not be used when speculation is on, independent of configuration. + ## Upgrading from Spark SQL 1.3 to 1.4 #### DataFrame data reader/writer interface @@ -2038,7 +2079,8 @@ See the API docs for `SQLContext.read` ( #### DataFrame.groupBy retains grouping columns -Based on user feedback, we changed the default behavior of `DataFrame.groupBy().agg()` to retain the grouping columns in the resulting `DataFrame`. To keep the behavior in 1.3, set `spark.sql.retainGroupColumns` to `false`. +Based on user feedback, we changed the default behavior of `DataFrame.groupBy().agg()` to retain the +grouping columns in the resulting `DataFrame`. To keep the behavior in 1.3, set `spark.sql.retainGroupColumns` to `false`.
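For illustration only (not from the patch), a Scala sketch of the `groupBy` change described above; `df` with `department` and `age` columns and `sqlContext` are placeholders.

{% highlight scala %}
import org.apache.spark.sql.functions._

// In 1.4+ the grouping column is retained: result columns are department, max(age).
df.groupBy("department").agg(max("age")).show()

// Restore the 1.3 behavior (aggregate columns only).
sqlContext.setConf("spark.sql.retainGroupColumns", "false")
df.groupBy("department").agg(max("age")).show()
{% endhighlight %}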
@@ -2175,7 +2217,7 @@ Python UDF registration is unchanged. When using DataTypes in Python you will need to construct them (i.e. `StringType()`) instead of referencing a singleton. -## Migration Guide for Shark User +## Migration Guide for Shark Users ### Scheduling To set a [Fair Scheduler](job-scheduling.html#fair-scheduler-pools) pool for a JDBC client session, @@ -2251,6 +2293,7 @@ Spark SQL supports the vast majority of Hive features, such as: * User defined functions (UDF) * User defined aggregation functions (UDAF) * User defined serialization formats (SerDes) +* Window functions * Joins * `JOIN` * `{LEFT|RIGHT|FULL} OUTER JOIN` @@ -2261,7 +2304,7 @@ Spark SQL supports the vast majority of Hive features, such as: * `SELECT col FROM ( SELECT a + b AS col from t1) t2` * Sampling * Explain -* Partitioned tables +* Partitioned tables including dynamic partition insertion * View * All Hive DDL Functions, including: * `CREATE TABLE` @@ -2323,8 +2366,9 @@ releases of Spark SQL. Hive can optionally merge the small files into fewer large files to avoid overflowing the HDFS metadata. Spark SQL does not support that. +# Reference -# Data Types +## Data Types Spark SQL and DataFrames support the following data types: @@ -2937,3 +2981,13 @@ from pyspark.sql.types import *
+## NaN Semantics + +There is specially handling for not-a-number (NaN) when dealing with `float` or `double` types that +does not exactly match standard floating point semantics. +Specifically: + + - NaN = NaN returns true. + - In aggregations all NaN values are grouped together. + - NaN is treated as a normal value in join keys. + - NaN values go last when in ascending order, larger than any other numeric value. From 84baa5e9b5edc8c55871fbed5057324450bf097f Mon Sep 17 00:00:00 2001 From: CodingCat Date: Thu, 27 Aug 2015 20:19:09 +0100 Subject: [PATCH 111/802] [SPARK-10315] remove document on spark.akka.failure-detector.threshold https://issues.apache.org/jira/browse/SPARK-10315 this parameter is not used any longer and there is some mistake in the current document , should be 'akka.remote.watch-failure-detector.threshold' Author: CodingCat Closes #8483 from CodingCat/SPARK_10315. --- docs/configuration.md | 10 ---------- 1 file changed, 10 deletions(-) diff --git a/docs/configuration.md b/docs/configuration.md index 4a6e4dd05b661..77c5cbc7b3196 100644 --- a/docs/configuration.md +++ b/docs/configuration.md @@ -906,16 +906,6 @@ Apart from these, the following properties are also available, and may be useful #### Networking
 <tr><th>Property Name</th><th>Default</th><th>Meaning</th></tr>
 <tr>
   <td><code>spark.sql.hive.metastore.version</code></td>
   <td><code>0.13.1</code></td>
   <td>
     Version of the Hive metastore. Available
-    options are 0.12.0 and 0.13.1. Support for more versions is coming in the future.
+    options are 0.12.0 through 1.2.1.
   </td>
 </tr>
- - - - - From 6185cdd2afcd492b77ff225b477b3624e3bc7bb2 Mon Sep 17 00:00:00 2001 From: Yuhao Yang Date: Thu, 27 Aug 2015 13:57:20 -0700 Subject: [PATCH 112/802] [SPARK-9901] User guide for RowMatrix Tall-and-skinny QR jira: https://issues.apache.org/jira/browse/SPARK-9901 The jira covers only the document update. I can further provide example code for QR (like the ones for SVD and PCA) in a separate PR. Author: Yuhao Yang Closes #8462 from hhbyyh/qrDoc. --- docs/mllib-data-types.md | 11 ++++++++++- 1 file changed, 10 insertions(+), 1 deletion(-) diff --git a/docs/mllib-data-types.md b/docs/mllib-data-types.md index f0e8d5495675d..065bf4727624f 100644 --- a/docs/mllib-data-types.md +++ b/docs/mllib-data-types.md @@ -337,7 +337,10 @@ limited by the integer range but it should be much smaller in practice.
A [`RowMatrix`](api/scala/index.html#org.apache.spark.mllib.linalg.distributed.RowMatrix) can be -created from an `RDD[Vector]` instance. Then we can compute its column summary statistics. +created from an `RDD[Vector]` instance. Then we can compute its column summary statistics and decompositions. +[QR decomposition](https://en.wikipedia.org/wiki/QR_decomposition) is of the form A = QR where Q is an orthogonal matrix and R is an upper triangular matrix. +For [singular value decomposition (SVD)](https://en.wikipedia.org/wiki/Singular_value_decomposition) and [principal component analysis (PCA)](https://en.wikipedia.org/wiki/Principal_component_analysis), please refer to [Dimensionality reduction](mllib-dimensionality-reduction.html). + {% highlight scala %} import org.apache.spark.mllib.linalg.Vector @@ -350,6 +353,9 @@ val mat: RowMatrix = new RowMatrix(rows) // Get its size. val m = mat.numRows() val n = mat.numCols() + +// QR decomposition +val qrResult = mat.tallSkinnyQR(true) {% endhighlight %}
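As a hedged follow-up to the Scala example above (it reuses the `qrResult` value and is not part of the patch), the returned `QRDecomposition` exposes the two factors directly:

{% highlight scala %}
// Q has orthonormal columns and the same number of rows as mat; R is the n x n upper triangular factor.
val q = qrResult.Q
val r = qrResult.R
println(r)
{% endhighlight %}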
@@ -370,6 +376,9 @@ RowMatrix mat = new RowMatrix(rows.rdd()); // Get its size. long m = mat.numRows(); long n = mat.numCols(); + +// QR decomposition +QRDecomposition result = mat.tallSkinnyQR(true); {% endhighlight %} From c94ecdfc5b3c0fe6c38a170dc2af9259354dc9e3 Mon Sep 17 00:00:00 2001 From: MechCoder Date: Thu, 27 Aug 2015 15:33:43 -0700 Subject: [PATCH 113/802] [SPARK-9906] [ML] User guide for LogisticRegressionSummary User guide for LogisticRegression summaries Author: MechCoder Author: Manoj Kumar Author: Feynman Liang Closes #8197 from MechCoder/log_summary_user_guide. --- docs/ml-linear-methods.md | 149 ++++++++++++++++++++++++++++++++++---- 1 file changed, 133 insertions(+), 16 deletions(-) diff --git a/docs/ml-linear-methods.md b/docs/ml-linear-methods.md index 1ac83d94c9e81..2761aeb789621 100644 --- a/docs/ml-linear-methods.md +++ b/docs/ml-linear-methods.md @@ -23,20 +23,41 @@ displayTitle: ML - Linear Methods \]` -In MLlib, we implement popular linear methods such as logistic regression and linear least squares with L1 or L2 regularization. Refer to [the linear methods in mllib](mllib-linear-methods.html) for details. In `spark.ml`, we also include Pipelines API for [Elastic net](http://en.wikipedia.org/wiki/Elastic_net_regularization), a hybrid of L1 and L2 regularization proposed in [this paper](http://users.stat.umn.edu/~zouxx019/Papers/elasticnet.pdf). Mathematically it is defined as a linear combination of the L1-norm and the L2-norm: +In MLlib, we implement popular linear methods such as logistic +regression and linear least squares with $L_1$ or $L_2$ regularization. +Refer to [the linear methods in mllib](mllib-linear-methods.html) for +details. In `spark.ml`, we also include Pipelines API for [Elastic +net](http://en.wikipedia.org/wiki/Elastic_net_regularization), a hybrid +of $L_1$ and $L_2$ regularization proposed in [Zou et al, Regularization +and variable selection via the elastic +net](http://users.stat.umn.edu/~zouxx019/Papers/elasticnet.pdf). +Mathematically, it is defined as a convex combination of the $L_1$ and +the $L_2$ regularization terms: `\[ -\alpha \|\wv\|_1 + (1-\alpha) \frac{1}{2}\|\wv\|_2^2, \alpha \in [0, 1]. +\alpha~\lambda \|\wv\|_1 + (1-\alpha) \frac{\lambda}{2}\|\wv\|_2^2, \alpha \in [0, 1], \lambda \geq 0. \]` -By setting $\alpha$ properly, it contains both L1 and L2 regularization as special cases. For example, if a [linear regression](https://en.wikipedia.org/wiki/Linear_regression) model is trained with the elastic net parameter $\alpha$ set to $1$, it is equivalent to a [Lasso](http://en.wikipedia.org/wiki/Least_squares#Lasso_method) model. On the other hand, if $\alpha$ is set to $0$, the trained model reduces to a [ridge regression](http://en.wikipedia.org/wiki/Tikhonov_regularization) model. We implement Pipelines API for both linear regression and logistic regression with elastic net regularization. - -**Examples** +By setting $\alpha$ properly, elastic net contains both $L_1$ and $L_2$ +regularization as special cases. For example, if a [linear +regression](https://en.wikipedia.org/wiki/Linear_regression) model is +trained with the elastic net parameter $\alpha$ set to $1$, it is +equivalent to a +[Lasso](http://en.wikipedia.org/wiki/Least_squares#Lasso_method) model. +On the other hand, if $\alpha$ is set to $0$, the trained model reduces +to a [ridge +regression](http://en.wikipedia.org/wiki/Tikhonov_regularization) model. 
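For illustration only (not from the patch), a Scala sketch of the two special cases just described; the regularization strength 0.1 is an arbitrary placeholder.

{% highlight scala %}
import org.apache.spark.ml.regression.LinearRegression

// elasticNetParam = 1.0 gives a lasso (pure L1) model, 0.0 gives a ridge (pure L2) model.
val lasso = new LinearRegression().setElasticNetParam(1.0).setRegParam(0.1)
val ridge = new LinearRegression().setElasticNetParam(0.0).setRegParam(0.1)
{% endhighlight %}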
+We implement Pipelines API for both linear regression and logistic +regression with elastic net regularization. + +## Example: Logistic Regression + +The following example shows how to train a logistic regression model +with elastic net regularization. `elasticNetParam` corresponds to +$\alpha$ and `regParam` corresponds to $\lambda$.
- {% highlight scala %} - import org.apache.spark.ml.classification.LogisticRegression import org.apache.spark.mllib.util.MLUtils @@ -53,15 +74,11 @@ val lrModel = lr.fit(training) // Print the weights and intercept for logistic regression println(s"Weights: ${lrModel.weights} Intercept: ${lrModel.intercept}") - {% endhighlight %} -
- {% highlight java %} - import org.apache.spark.ml.classification.LogisticRegression; import org.apache.spark.ml.classification.LogisticRegressionModel; import org.apache.spark.mllib.regression.LabeledPoint; @@ -99,9 +116,7 @@ public class LogisticRegressionWithElasticNetExample {
- {% highlight python %} - from pyspark.ml.classification import LogisticRegression from pyspark.mllib.regression import LabeledPoint from pyspark.mllib.util import MLUtils @@ -118,12 +133,114 @@ lrModel = lr.fit(training) print("Weights: " + str(lrModel.weights)) print("Intercept: " + str(lrModel.intercept)) {% endhighlight %} +
+The `spark.ml` implementation of logistic regression also supports
+extracting a summary of the model over the training set. Note that the
+predictions and metrics which are stored as `DataFrame` in
+`BinaryLogisticRegressionSummary` are annotated `@transient` and hence
+only available on the driver.
+
+ +
+ +[`LogisticRegressionTrainingSummary`](api/scala/index.html#org.apache.spark.ml.classification.LogisticRegressionTrainingSummary) +provides a summary for a +[`LogisticRegressionModel`](api/scala/index.html#org.apache.spark.ml.classification.LogisticRegressionModel). +Currently, only binary classification is supported and the +summary must be explicitly cast to +[`BinaryLogisticRegressionTrainingSummary`](api/scala/index.html#org.apache.spark.ml.classification.BinaryLogisticRegressionTrainingSummary). +This will likely change when multiclass classification is supported. + +Continuing the earlier example: + +{% highlight scala %} +// Extract the summary from the returned LogisticRegressionModel instance trained in the earlier example +val trainingSummary = lrModel.summary + +// Obtain the loss per iteration. +val objectiveHistory = trainingSummary.objectiveHistory +objectiveHistory.foreach(loss => println(loss)) + +// Obtain the metrics useful to judge performance on test data. +// We cast the summary to a BinaryLogisticRegressionSummary since the problem is a +// binary classification problem. +val binarySummary = trainingSummary.asInstanceOf[BinaryLogisticRegressionSummary] + +// Obtain the receiver-operating characteristic as a dataframe and areaUnderROC. +val roc = binarySummary.roc +roc.show() +roc.select("FPR").show() +println(binarySummary.areaUnderROC) + +// Get the threshold corresponding to the maximum F-Measure and rerun LogisticRegression with +// this selected threshold. +val fMeasure = binarySummary.fMeasureByThreshold +val maxFMeasure = fMeasure.select(max("F-Measure")).head().getDouble(0) +val bestThreshold = fMeasure.where($"F-Measure" === maxFMeasure). + select("threshold").head().getDouble(0) +logReg.setThreshold(bestThreshold) +logReg.fit(logRegDataFrame) +{% endhighlight %}
-### Optimization +
+[`LogisticRegressionTrainingSummary`](api/java/org/apache/spark/ml/classification/LogisticRegressionTrainingSummary.html) +provides a summary for a +[`LogisticRegressionModel`](api/java/org/apache/spark/ml/classification/LogisticRegressionModel.html). +Currently, only binary classification is supported and the +summary must be explicitly cast to +[`BinaryLogisticRegressionTrainingSummary`](api/java/org/apache/spark/ml/classification/BinaryLogisticRegressionTrainingSummary.html). +This will likely change when multiclass classification is supported. + +Continuing the earlier example: + +{% highlight java %} +// Extract the summary from the returned LogisticRegressionModel instance trained in the earlier example +LogisticRegressionTrainingSummary trainingSummary = logRegModel.summary(); + +// Obtain the loss per iteration. +double[] objectiveHistory = trainingSummary.objectiveHistory(); +for (double lossPerIteration : objectiveHistory) { + System.out.println(lossPerIteration); +} + +// Obtain the metrics useful to judge performance on test data. +// We cast the summary to a BinaryLogisticRegressionSummary since the problem is a +// binary classification problem. +BinaryLogisticRegressionSummary binarySummary = (BinaryLogisticRegressionSummary) trainingSummary; + +// Obtain the receiver-operating characteristic as a dataframe and areaUnderROC. +DataFrame roc = binarySummary.roc(); +roc.show(); +roc.select("FPR").show(); +System.out.println(binarySummary.areaUnderROC()); + +// Get the threshold corresponding to the maximum F-Measure and rerun LogisticRegression with +// this selected threshold. +DataFrame fMeasure = binarySummary.fMeasureByThreshold(); +double maxFMeasure = fMeasure.select(max("F-Measure")).head().getDouble(0); +double bestThreshold = fMeasure.where(fMeasure.col("F-Measure").equalTo(maxFMeasure)). + select("threshold").head().getDouble(0); +logReg.setThreshold(bestThreshold); +logReg.fit(logRegDataFrame); +{% endhighlight %} +
+ +
+Logistic regression model summary is not yet supported in Python. +
+ +
+ +# Optimization + +The optimization algorithm underlying the implementation is called +[Orthant-Wise Limited-memory +QuasiNewton](http://research-srv.microsoft.com/en-us/um/people/jfgao/paper/icml07scalable.pdf) +(OWL-QN). It is an extension of L-BFGS that can effectively handle L1 +regularization and elastic net. -The optimization algorithm underlies the implementation is called [Orthant-Wise Limited-memory QuasiNewton](http://research-srv.microsoft.com/en-us/um/people/jfgao/paper/icml07scalable.pdf) -(OWL-QN). It is an extension of L-BFGS that can effectively handle L1 regularization and elastic net. From 5bfe9e1111d9862084586549a7dc79476f67bab9 Mon Sep 17 00:00:00 2001 From: Feynman Liang Date: Thu, 27 Aug 2015 16:10:37 -0700 Subject: [PATCH 114/802] [SPARK-9680] [MLLIB] [DOC] StopWordsRemovers user guide and Java compatibility test * Adds user guide for ml.feature.StopWordsRemovers, ran code examples on my machine * Cleans up scaladocs for public methods * Adds test for Java compatibility * Follow up Python user guide code example is tracked by SPARK-10249 Author: Feynman Liang Closes #8436 from feynmanliang/SPARK-10230. --- docs/ml-features.md | 102 +++++++++++++++++- .../ml/feature/JavaStopWordsRemoverSuite.java | 72 +++++++++++++ 2 files changed, 171 insertions(+), 3 deletions(-) create mode 100644 mllib/src/test/java/org/apache/spark/ml/feature/JavaStopWordsRemoverSuite.java diff --git a/docs/ml-features.md b/docs/ml-features.md index 62de4838981cb..89a9bad570446 100644 --- a/docs/ml-features.md +++ b/docs/ml-features.md @@ -306,15 +306,111 @@ regexTokenizer = RegexTokenizer(inputCol="sentence", outputCol="words", pattern= +## StopWordsRemover +[Stop words](https://en.wikipedia.org/wiki/Stop_words) are words which +should be excluded from the input, typically because the words appear +frequently and don't carry as much meaning. + +`StopWordsRemover` takes as input a sequence of strings (e.g. the output +of a [Tokenizer](ml-features.html#tokenizer)) and drops all the stop +words from the input sequences. The list of stopwords is specified by +the `stopWords` parameter. We provide [a list of stop +words](http://ir.dcs.gla.ac.uk/resources/linguistic_utils/stop_words) by +default, accessible by calling `getStopWords` on a newly instantiated +`StopWordsRemover` instance. -## $n$-gram +**Examples** -An [n-gram](https://en.wikipedia.org/wiki/N-gram) is a sequence of $n$ tokens (typically words) for some integer $n$. The `NGram` class can be used to transform input features into $n$-grams. +Assume that we have the following DataFrame with columns `id` and `raw`: -`NGram` takes as input a sequence of strings (e.g. the output of a [Tokenizer](ml-features.html#tokenizer). The parameter `n` is used to determine the number of terms in each $n$-gram. The output will consist of a sequence of $n$-grams where each $n$-gram is represented by a space-delimited string of $n$ consecutive words. If the input sequence contains fewer than `n` strings, no output is produced. +~~~~ + id | raw +----|---------- + 0 | [I, saw, the, red, baloon] + 1 | [Mary, had, a, little, lamb] +~~~~ + +Applying `StopWordsRemover` with `raw` as the input column and `filtered` as the output +column, we should get the following: + +~~~~ + id | raw | filtered +----|-----------------------------|-------------------- + 0 | [I, saw, the, red, baloon] | [saw, red, baloon] + 1 | [Mary, had, a, little, lamb]|[Mary, little, lamb] +~~~~ + +In `filtered`, the stop words "I", "the", "had", and "a" have been +filtered out.
+
+ +[`StopWordsRemover`](api/scala/index.html#org.apache.spark.ml.feature.StopWordsRemover) +takes an input column name, an output column name, a list of stop words, +and a boolean indicating if the matches should be case sensitive (false +by default). + +{% highlight scala %} +import org.apache.spark.ml.feature.StopWordsRemover + +val remover = new StopWordsRemover() + .setInputCol("raw") + .setOutputCol("filtered") +val dataSet = sqlContext.createDataFrame(Seq( + (0, Seq("I", "saw", "the", "red", "baloon")), + (1, Seq("Mary", "had", "a", "little", "lamb")) +)).toDF("id", "raw") + +remover.transform(dataSet).show() +{% endhighlight %} +
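A possible follow-up sketch (not from the patch) that reuses the `dataSet` from the example above and overrides the default stop word list:

{% highlight scala %}
// Supply a custom stop word list and make matching case sensitive.
val customRemover = new StopWordsRemover()
  .setInputCol("raw")
  .setOutputCol("filtered")
  .setStopWords(Array("saw", "Mary"))   // replaces the default English list
  .setCaseSensitive(true)

customRemover.transform(dataSet).show()
{% endhighlight %}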
+ +
+ +[`StopWordsRemover`](api/java/org/apache/spark/ml/feature/StopWordsRemover.html) +takes an input column name, an output column name, a list of stop words, +and a boolean indicating if the matches should be case sensitive (false +by default). + +{% highlight java %} +import java.util.Arrays; + +import org.apache.spark.api.java.JavaRDD; +import org.apache.spark.ml.feature.StopWordsRemover; +import org.apache.spark.sql.DataFrame; +import org.apache.spark.sql.Row; +import org.apache.spark.sql.RowFactory; +import org.apache.spark.sql.types.DataTypes; +import org.apache.spark.sql.types.Metadata; +import org.apache.spark.sql.types.StructField; +import org.apache.spark.sql.types.StructType; + +StopWordsRemover remover = new StopWordsRemover() + .setInputCol("raw") + .setOutputCol("filtered"); + +JavaRDD rdd = jsc.parallelize(Arrays.asList( + RowFactory.create(Arrays.asList("I", "saw", "the", "red", "baloon")), + RowFactory.create(Arrays.asList("Mary", "had", "a", "little", "lamb")) +)); +StructType schema = new StructType(new StructField[] { + new StructField("raw", DataTypes.createArrayType(DataTypes.StringType), false, Metadata.empty()) +}); +DataFrame dataset = jsql.createDataFrame(rdd, schema); + +remover.transform(dataset).show(); +{% endhighlight %} +
+
+ +## $n$-gram + +An [n-gram](https://en.wikipedia.org/wiki/N-gram) is a sequence of $n$ tokens (typically words) for some integer $n$. The `NGram` class can be used to transform input features into $n$-grams. + +`NGram` takes as input a sequence of strings (e.g. the output of a [Tokenizer](ml-features.html#tokenizer)). The parameter `n` is used to determine the number of terms in each $n$-gram. The output will consist of a sequence of $n$-grams where each $n$-gram is represented by a space-delimited string of $n$ consecutive words. If the input sequence contains fewer than `n` strings, no output is produced. +
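Since the paragraph above describes `NGram` only in prose, here is a hedged Scala sketch (not from the patch); `sqlContext` and the toy sentence are assumptions.

{% highlight scala %}
import org.apache.spark.ml.feature.NGram

// Produce bigrams (n = 2) from a tokenized column.
val wordDataFrame = sqlContext.createDataFrame(Seq(
  (0, Seq("Hi", "I", "heard", "about", "Spark"))
)).toDF("id", "words")

val ngram = new NGram().setN(2).setInputCol("words").setOutputCol("ngrams")
ngram.transform(wordDataFrame).select("ngrams").show()
{% endhighlight %}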
diff --git a/mllib/src/test/java/org/apache/spark/ml/feature/JavaStopWordsRemoverSuite.java b/mllib/src/test/java/org/apache/spark/ml/feature/JavaStopWordsRemoverSuite.java new file mode 100644 index 0000000000000..76cdd0fae84ab --- /dev/null +++ b/mllib/src/test/java/org/apache/spark/ml/feature/JavaStopWordsRemoverSuite.java @@ -0,0 +1,72 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.spark.ml.feature; + +import java.util.Arrays; + +import org.junit.After; +import org.junit.Before; +import org.junit.Test; + +import org.apache.spark.api.java.JavaRDD; +import org.apache.spark.api.java.JavaSparkContext; +import org.apache.spark.sql.DataFrame; +import org.apache.spark.sql.Row; +import org.apache.spark.sql.RowFactory; +import org.apache.spark.sql.SQLContext; +import org.apache.spark.sql.types.DataTypes; +import org.apache.spark.sql.types.Metadata; +import org.apache.spark.sql.types.StructField; +import org.apache.spark.sql.types.StructType; + + +public class JavaStopWordsRemoverSuite { + + private transient JavaSparkContext jsc; + private transient SQLContext jsql; + + @Before + public void setUp() { + jsc = new JavaSparkContext("local", "JavaStopWordsRemoverSuite"); + jsql = new SQLContext(jsc); + } + + @After + public void tearDown() { + jsc.stop(); + jsc = null; + } + + @Test + public void javaCompatibilityTest() { + StopWordsRemover remover = new StopWordsRemover() + .setInputCol("raw") + .setOutputCol("filtered"); + + JavaRDD rdd = jsc.parallelize(Arrays.asList( + RowFactory.create(Arrays.asList("I", "saw", "the", "red", "baloon")), + RowFactory.create(Arrays.asList("Mary", "had", "a", "little", "lamb")) + )); + StructType schema = new StructType(new StructField[] { + new StructField("raw", DataTypes.createArrayType(DataTypes.StringType), false, Metadata.empty()) + }); + DataFrame dataset = jsql.createDataFrame(rdd, schema); + + remover.transform(dataset).collect(); + } +} From b3dd569ad40905f8861a547a1e25ed3ca8e1d272 Mon Sep 17 00:00:00 2001 From: Yin Huai Date: Thu, 27 Aug 2015 16:11:25 -0700 Subject: [PATCH 115/802] [SPARK-10287] [SQL] Fixes JSONRelation refreshing on read path https://issues.apache.org/jira/browse/SPARK-10287 After porting json to HadoopFsRelation, it seems hard to keep the behavior of picking up new files automatically for JSON. This PR removes this behavior, so JSON is consistent with others (ORC and Parquet). Author: Yin Huai Closes #8469 from yhuai/jsonRefresh. 
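A hedged sketch of the `REFRESH TABLE` workaround this change documents below; the table name `json_table` is a placeholder and `sqlContext` is assumed to be a `HiveContext`.

{% highlight scala %}
// Pick up JSON files written by other applications after this change.
sqlContext.sql("REFRESH TABLE json_table")
// or, equivalently, through the API:
sqlContext.refreshTable("json_table")
{% endhighlight %}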
--- docs/sql-programming-guide.md | 6 ++++++ .../execution/datasources/json/JSONRelation.scala | 9 --------- .../org/apache/spark/sql/sources/interfaces.scala | 2 +- .../apache/spark/sql/sources/InsertSuite.scala | 15 --------------- 4 files changed, 7 insertions(+), 25 deletions(-) diff --git a/docs/sql-programming-guide.md b/docs/sql-programming-guide.md index 99fec6c7785af..e8eb88488ee24 100644 --- a/docs/sql-programming-guide.md +++ b/docs/sql-programming-guide.md @@ -2057,6 +2057,12 @@ options. - The canonical name of SQL/DataFrame functions are now lower case (e.g. sum vs SUM). - It has been determined that using the DirectOutputCommitter when speculation is enabled is unsafe and thus this output committer will not be used when speculation is on, independent of configuration. + - JSON data source will not automatically load new files that are created by other applications + (i.e. files that are not inserted to the dataset through Spark SQL). + For a JSON persistent table (i.e. the metadata of the table is stored in Hive Metastore), + users can use `REFRESH TABLE` SQL command or `HiveContext`'s `refreshTable` method + to include those new files to the table. For a DataFrame representing a JSON dataset, users need to recreate + the DataFrame and the new DataFrame will include new files. ## Upgrading from Spark SQL 1.3 to 1.4 diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/json/JSONRelation.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/json/JSONRelation.scala index 114c8b211891e..ab8ca5f748f24 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/json/JSONRelation.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/json/JSONRelation.scala @@ -111,15 +111,6 @@ private[sql] class JSONRelation( jsonSchema } - override private[sql] def buildScan( - requiredColumns: Array[String], - filters: Array[Filter], - inputPaths: Array[String], - broadcastedConf: Broadcast[SerializableConfiguration]): RDD[Row] = { - refresh() - super.buildScan(requiredColumns, filters, inputPaths, broadcastedConf) - } - override def buildScan( requiredColumns: Array[String], filters: Array[Filter], diff --git a/sql/core/src/main/scala/org/apache/spark/sql/sources/interfaces.scala b/sql/core/src/main/scala/org/apache/spark/sql/sources/interfaces.scala index b3b326fe612c7..dff726b33fc74 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/sources/interfaces.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/sources/interfaces.scala @@ -562,7 +562,7 @@ abstract class HadoopFsRelation private[sql](maybePartitionSpec: Option[Partitio }) } - private[sql] def buildScan( + final private[sql] def buildScan( requiredColumns: Array[String], filters: Array[Filter], inputPaths: Array[String], diff --git a/sql/core/src/test/scala/org/apache/spark/sql/sources/InsertSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/sources/InsertSuite.scala index 78bd3e5582964..084d83f6e9bff 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/sources/InsertSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/sources/InsertSuite.scala @@ -167,21 +167,6 @@ class InsertSuite extends DataSourceTest with SharedSQLContext { ) } - test("save directly to the path of a JSON table") { - caseInsensitiveContext.table("jt").selectExpr("a * 5 as a", "b") - .write.mode(SaveMode.Overwrite).json(path.toString) - checkAnswer( - sql("SELECT a, b FROM jsonTable"), - (1 to 10).map(i => Row(i * 5, s"str$i")) - ) - - 
caseInsensitiveContext.table("jt").write.mode(SaveMode.Overwrite).json(path.toString) - checkAnswer( - sql("SELECT a, b FROM jsonTable"), - (1 to 10).map(i => Row(i, s"str$i")) - ) - } - test("it is not allowed to write to a table while querying it.") { val message = intercept[AnalysisException] { sql( From 54cda0deb6bebf1470f16ba5bcc6c4fb842bdac1 Mon Sep 17 00:00:00 2001 From: Davies Liu Date: Thu, 27 Aug 2015 16:38:00 -0700 Subject: [PATCH 116/802] [SPARK-10321] sizeInBytes in HadoopFsRelation Having sizeInBytes in HadoopFsRelation to enable broadcast join. cc marmbrus Author: Davies Liu Closes #8490 from davies/sizeInByte. --- .../main/scala/org/apache/spark/sql/sources/interfaces.scala | 2 ++ 1 file changed, 2 insertions(+) diff --git a/sql/core/src/main/scala/org/apache/spark/sql/sources/interfaces.scala b/sql/core/src/main/scala/org/apache/spark/sql/sources/interfaces.scala index dff726b33fc74..7b030b7d73bd5 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/sources/interfaces.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/sources/interfaces.scala @@ -518,6 +518,8 @@ abstract class HadoopFsRelation private[sql](maybePartitionSpec: Option[Partitio override def inputFiles: Array[String] = cachedLeafStatuses().map(_.getPath.toString).toArray + override def sizeInBytes: Long = cachedLeafStatuses().map(_.getLen).sum + /** * Partition columns. Can be either defined by [[userDefinedPartitionColumns]] or automatically * discovered. Note that they should always be nullable. From 1f90c5e2198bcf49e115d97ec300c17c1be4dcb4 Mon Sep 17 00:00:00 2001 From: Yu ISHIKAWA Date: Thu, 27 Aug 2015 19:38:53 -0700 Subject: [PATCH 117/802] [SPARK-8505] [SPARKR] Add settings to kick `lint-r` from `./dev/run-test.py` JoshRosen we'd like to check the SparkR source code with the `dev/lint-r` script on the Jenkins. I tried to incorporate the script into `dev/run-test.py`. Could you review it when you have time? shivaram I modified `dev/lint-r` and `dev/lint-r.R` to install lintr package into a local directory(`R/lib/`) and to exit with a lint status. Could you review it? - [[SPARK-8505] Add settings to kick `lint-r` from `./dev/run-test.py` - ASF JIRA](https://issues.apache.org/jira/browse/SPARK-8505) Author: Yu ISHIKAWA Closes #7883 from yu-iskw/SPARK-8505. --- dev/lint-r | 11 +++++++++++ dev/lint-r.R | 12 +++++++----- dev/run-tests-codes.sh | 13 +++++++------ dev/run-tests-jenkins | 2 ++ dev/run-tests.py | 21 ++++++++++++++++++++- 5 files changed, 47 insertions(+), 12 deletions(-) diff --git a/dev/lint-r b/dev/lint-r index 7d5f4cd31153d..c15d57aad86da 100755 --- a/dev/lint-r +++ b/dev/lint-r @@ -28,3 +28,14 @@ if ! type "Rscript" > /dev/null; then fi `which Rscript` --vanilla "$SPARK_ROOT_DIR/dev/lint-r.R" "$SPARK_ROOT_DIR" | tee "$LINT_R_REPORT_FILE_NAME" + +NUM_LINES=`wc -l < "$LINT_R_REPORT_FILE_NAME"` +if [ "$NUM_LINES" = "0" ] ; then + lint_status=0 + echo "lintr checks passed." +else + lint_status=1 + echo "lintr checks failed." +fi + +exit "$lint_status" diff --git a/dev/lint-r.R b/dev/lint-r.R index 48bd6246096ae..999eef571b824 100644 --- a/dev/lint-r.R +++ b/dev/lint-r.R @@ -17,8 +17,14 @@ argv <- commandArgs(TRUE) SPARK_ROOT_DIR <- as.character(argv[1]) +LOCAL_LIB_LOC <- file.path(SPARK_ROOT_DIR, "R", "lib") -# Installs lintr from Github. +# Checks if SparkR is installed in a local directory. +if (! 
library(SparkR, lib.loc = LOCAL_LIB_LOC, logical.return = TRUE)) { + stop("You should install SparkR in a local directory with `R/install-dev.sh`.") +} + +# Installs lintr from Github in a local directory. # NOTE: The CRAN's version is too old to adapt to our rules. if ("lintr" %in% row.names(installed.packages()) == FALSE) { devtools::install_github("jimhester/lintr") @@ -27,9 +33,5 @@ if ("lintr" %in% row.names(installed.packages()) == FALSE) { library(lintr) library(methods) library(testthat) -if (! library(SparkR, lib.loc = file.path(SPARK_ROOT_DIR, "R", "lib"), logical.return = TRUE)) { - stop("You should install SparkR in a local directory with `R/install-dev.sh`.") -} - path.to.package <- file.path(SPARK_ROOT_DIR, "R", "pkg") lint_package(path.to.package, cache = FALSE) diff --git a/dev/run-tests-codes.sh b/dev/run-tests-codes.sh index f4b238e1b78a7..1f16790522e76 100644 --- a/dev/run-tests-codes.sh +++ b/dev/run-tests-codes.sh @@ -21,9 +21,10 @@ readonly BLOCK_GENERAL=10 readonly BLOCK_RAT=11 readonly BLOCK_SCALA_STYLE=12 readonly BLOCK_PYTHON_STYLE=13 -readonly BLOCK_DOCUMENTATION=14 -readonly BLOCK_BUILD=15 -readonly BLOCK_MIMA=16 -readonly BLOCK_SPARK_UNIT_TESTS=17 -readonly BLOCK_PYSPARK_UNIT_TESTS=18 -readonly BLOCK_SPARKR_UNIT_TESTS=19 +readonly BLOCK_R_STYLE=14 +readonly BLOCK_DOCUMENTATION=15 +readonly BLOCK_BUILD=16 +readonly BLOCK_MIMA=17 +readonly BLOCK_SPARK_UNIT_TESTS=18 +readonly BLOCK_PYSPARK_UNIT_TESTS=19 +readonly BLOCK_SPARKR_UNIT_TESTS=20 diff --git a/dev/run-tests-jenkins b/dev/run-tests-jenkins index f144c053046c5..39cf54f78104c 100755 --- a/dev/run-tests-jenkins +++ b/dev/run-tests-jenkins @@ -210,6 +210,8 @@ done failing_test="Scala style tests" elif [ "$test_result" -eq "$BLOCK_PYTHON_STYLE" ]; then failing_test="Python style tests" + elif [ "$test_result" -eq "$BLOCK_R_STYLE" ]; then + failing_test="R style tests" elif [ "$test_result" -eq "$BLOCK_DOCUMENTATION" ]; then failing_test="to generate documentation" elif [ "$test_result" -eq "$BLOCK_BUILD" ]; then diff --git a/dev/run-tests.py b/dev/run-tests.py index f689425ee40b6..4fd703a7c219f 100755 --- a/dev/run-tests.py +++ b/dev/run-tests.py @@ -209,6 +209,18 @@ def run_python_style_checks(): run_cmd([os.path.join(SPARK_HOME, "dev", "lint-python")]) +def run_sparkr_style_checks(): + set_title_and_block("Running R style checks", "BLOCK_R_STYLE") + + if which("R"): + # R style check should be executed after `install-dev.sh`. + # Since warnings about `no visible global function definition` appear + # without the installation. SEE ALSO: SPARK-9121. 
+ run_cmd([os.path.join(SPARK_HOME, "dev", "lint-r")]) + else: + print("Ignoring SparkR style check as R was not found in PATH") + + def build_spark_documentation(): set_title_and_block("Building Spark Documentation", "BLOCK_DOCUMENTATION") os.environ["PRODUCTION"] = "1 jekyll build" @@ -387,7 +399,6 @@ def run_sparkr_tests(): set_title_and_block("Running SparkR tests", "BLOCK_SPARKR_UNIT_TESTS") if which("R"): - run_cmd([os.path.join(SPARK_HOME, "R", "install-dev.sh")]) run_cmd([os.path.join(SPARK_HOME, "R", "run-tests.sh")]) else: print("Ignoring SparkR tests as R was not found in PATH") @@ -438,6 +449,12 @@ def main(): if java_version.minor < 8: print("[warn] Java 8 tests will not run because JDK version is < 1.8.") + # install SparkR + if which("R"): + run_cmd([os.path.join(SPARK_HOME, "R", "install-dev.sh")]) + else: + print("Can't install SparkR as R is was not found in PATH") + if os.environ.get("AMPLAB_JENKINS"): # if we're on the Amplab Jenkins build servers setup variables # to reflect the environment settings @@ -485,6 +502,8 @@ def main(): run_scala_style_checks() if not changed_files or any(f.endswith(".py") for f in changed_files): run_python_style_checks() + if not changed_files or any(f.endswith(".R") for f in changed_files): + run_sparkr_style_checks() # determine if docs were changed and if we're inside the amplab environment # note - the below commented out until *all* Jenkins workers can get `jekyll` installed From 30734d45fbbb269437c062241a9161e198805a76 Mon Sep 17 00:00:00 2001 From: MechCoder Date: Thu, 27 Aug 2015 21:44:06 -0700 Subject: [PATCH 118/802] [SPARK-9911] [DOC] [ML] Update Userguide for Evaluator I added a small note about the different types of evaluator and the metrics used. Author: MechCoder Closes #8304 from MechCoder/multiclass_evaluator. --- docs/ml-guide.md | 13 +++++++++++++ 1 file changed, 13 insertions(+) diff --git a/docs/ml-guide.md b/docs/ml-guide.md index de8fead3529e4..01bf5ee18e328 100644 --- a/docs/ml-guide.md +++ b/docs/ml-guide.md @@ -643,6 +643,13 @@ An important task in ML is *model selection*, or using data to find the best mod Currently, `spark.ml` supports model selection using the [`CrossValidator`](api/scala/index.html#org.apache.spark.ml.tuning.CrossValidator) class, which takes an `Estimator`, a set of `ParamMap`s, and an [`Evaluator`](api/scala/index.html#org.apache.spark.ml.Evaluator). `CrossValidator` begins by splitting the dataset into a set of *folds* which are used as separate training and test datasets; e.g., with `$k=3$` folds, `CrossValidator` will generate 3 (training, test) dataset pairs, each of which uses 2/3 of the data for training and 1/3 for testing. `CrossValidator` iterates through the set of `ParamMap`s. For each `ParamMap`, it trains the given `Estimator` and evaluates it using the given `Evaluator`. + +The `Evaluator` can be a [`RegressionEvaluator`](api/scala/index.html#org.apache.spark.ml.RegressionEvaluator) +for regression problems, a [`BinaryClassificationEvaluator`](api/scala/index.html#org.apache.spark.ml.BinaryClassificationEvaluator) +for binary data or a [`MultiClassClassificationEvaluator`](api/scala/index.html#org.apache.spark.ml.MultiClassClassificationEvaluator) +for multiclass problems. The default metric used to choose the best `ParamMap` can be overriden by the setMetric +method in each of these evaluators. + The `ParamMap` which produces the best evaluation metric (averaged over the `$k$` folds) is selected as the best model. 
`CrossValidator` finally fits the `Estimator` using the best `ParamMap` and the entire dataset. @@ -708,9 +715,12 @@ val pipeline = new Pipeline() // We now treat the Pipeline as an Estimator, wrapping it in a CrossValidator instance. // This will allow us to jointly choose parameters for all Pipeline stages. // A CrossValidator requires an Estimator, a set of Estimator ParamMaps, and an Evaluator. +// Note that the evaluator here is a BinaryClassificationEvaluator and the default metric +// used is areaUnderROC. val crossval = new CrossValidator() .setEstimator(pipeline) .setEvaluator(new BinaryClassificationEvaluator) + // We use a ParamGridBuilder to construct a grid of parameters to search over. // With 3 values for hashingTF.numFeatures and 2 values for lr.regParam, // this grid will have 3 x 2 = 6 parameter settings for CrossValidator to choose from. @@ -831,9 +841,12 @@ Pipeline pipeline = new Pipeline() // We now treat the Pipeline as an Estimator, wrapping it in a CrossValidator instance. // This will allow us to jointly choose parameters for all Pipeline stages. // A CrossValidator requires an Estimator, a set of Estimator ParamMaps, and an Evaluator. +// Note that the evaluator here is a BinaryClassificationEvaluator and the default metric +// used is areaUnderROC. CrossValidator crossval = new CrossValidator() .setEstimator(pipeline) .setEvaluator(new BinaryClassificationEvaluator()); + // We use a ParamGridBuilder to construct a grid of parameters to search over. // With 3 values for hashingTF.numFeatures and 2 values for lr.regParam, // this grid will have 3 x 2 = 6 parameter settings for CrossValidator to choose from. From af0e1249b1c881c0fa7a921fd21fd2c27214b980 Mon Sep 17 00:00:00 2001 From: Feynman Liang Date: Thu, 27 Aug 2015 21:55:20 -0700 Subject: [PATCH 119/802] [SPARK-9905] [ML] [DOC] Adds LinearRegressionSummary user guide * Adds user guide for `LinearRegressionSummary` * Fixes unresolved issues in #8197 CC jkbradley mengxr Author: Feynman Liang Closes #8491 from feynmanliang/SPARK-9905. --- docs/ml-linear-methods.md | 140 ++++++++++++++++++++++++++++++++++---- 1 file changed, 127 insertions(+), 13 deletions(-) diff --git a/docs/ml-linear-methods.md b/docs/ml-linear-methods.md index 2761aeb789621..cdd9d4999fa1b 100644 --- a/docs/ml-linear-methods.md +++ b/docs/ml-linear-methods.md @@ -34,7 +34,7 @@ net](http://users.stat.umn.edu/~zouxx019/Papers/elasticnet.pdf). Mathematically, it is defined as a convex combination of the $L_1$ and the $L_2$ regularization terms: `\[ -\alpha~\lambda \|\wv\|_1 + (1-\alpha) \frac{\lambda}{2}\|\wv\|_2^2, \alpha \in [0, 1], \lambda \geq 0. +\alpha \left( \lambda \|\wv\|_1 \right) + (1-\alpha) \left( \frac{\lambda}{2}\|\wv\|_2^2 \right) , \alpha \in [0, 1], \lambda \geq 0 \]` By setting $\alpha$ properly, elastic net contains both $L_1$ and $L_2$ regularization as special cases. 
For example, if a [linear @@ -95,7 +95,7 @@ public class LogisticRegressionWithElasticNetExample { SparkContext sc = new SparkContext(conf); SQLContext sql = new SQLContext(sc); - String path = "sample_libsvm_data.txt"; + String path = "data/mllib/sample_libsvm_data.txt"; // Load training data DataFrame training = sql.createDataFrame(MLUtils.loadLibSVMFile(sc, path).toJavaRDD(), LabeledPoint.class); @@ -103,7 +103,7 @@ public class LogisticRegressionWithElasticNetExample { LogisticRegression lr = new LogisticRegression() .setMaxIter(10) .setRegParam(0.3) - .setElasticNetParam(0.8) + .setElasticNetParam(0.8); // Fit the model LogisticRegressionModel lrModel = lr.fit(training); @@ -158,10 +158,12 @@ This will likely change when multiclass classification is supported. Continuing the earlier example: {% highlight scala %} +import org.apache.spark.ml.classification.BinaryLogisticRegressionSummary + // Extract the summary from the returned LogisticRegressionModel instance trained in the earlier example val trainingSummary = lrModel.summary -// Obtain the loss per iteration. +// Obtain the objective per iteration. val objectiveHistory = trainingSummary.objectiveHistory objectiveHistory.foreach(loss => println(loss)) @@ -173,17 +175,14 @@ val binarySummary = trainingSummary.asInstanceOf[BinaryLogisticRegressionSummary // Obtain the receiver-operating characteristic as a dataframe and areaUnderROC. val roc = binarySummary.roc roc.show() -roc.select("FPR").show() println(binarySummary.areaUnderROC) -// Get the threshold corresponding to the maximum F-Measure and rerun LogisticRegression with -// this selected threshold. +// Set the model threshold to maximize F-Measure val fMeasure = binarySummary.fMeasureByThreshold val maxFMeasure = fMeasure.select(max("F-Measure")).head().getDouble(0) val bestThreshold = fMeasure.where($"F-Measure" === maxFMeasure). select("threshold").head().getDouble(0) -logReg.setThreshold(bestThreshold) -logReg.fit(logRegDataFrame) +lrModel.setThreshold(bestThreshold) {% endhighlight %}
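The Scala summary example above relies on two imports that the patch does not show; as an assumption about the intended setup, they would look like this:

{% highlight scala %}
import org.apache.spark.sql.functions.max   // for max("F-Measure")
import sqlContext.implicits._               // for the $"F-Measure" column syntax
{% endhighlight %}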
@@ -199,8 +198,12 @@ This will likely change when multiclass classification is supported. Continuing the earlier example: {% highlight java %} +import org.apache.spark.ml.classification.LogisticRegressionTrainingSummary; +import org.apache.spark.ml.classification.BinaryLogisticRegressionSummary; +import org.apache.spark.sql.functions; + // Extract the summary from the returned LogisticRegressionModel instance trained in the earlier example -LogisticRegressionTrainingSummary trainingSummary = logRegModel.summary(); +LogisticRegressionTrainingSummary trainingSummary = lrModel.summary(); // Obtain the loss per iteration. double[] objectiveHistory = trainingSummary.objectiveHistory(); @@ -222,20 +225,131 @@ System.out.println(binarySummary.areaUnderROC()); // Get the threshold corresponding to the maximum F-Measure and rerun LogisticRegression with // this selected threshold. DataFrame fMeasure = binarySummary.fMeasureByThreshold(); -double maxFMeasure = fMeasure.select(max("F-Measure")).head().getDouble(0); +double maxFMeasure = fMeasure.select(functions.max("F-Measure")).head().getDouble(0); double bestThreshold = fMeasure.where(fMeasure.col("F-Measure").equalTo(maxFMeasure)). select("threshold").head().getDouble(0); -logReg.setThreshold(bestThreshold); -logReg.fit(logRegDataFrame); +lrModel.setThreshold(bestThreshold); {% endhighlight %}
+
Logistic regression model summary is not yet supported in Python.
+## Example: Linear Regression + +The interface for working with linear regression models and model +summaries is similar to the logistic regression case. The following +example demonstrates training an elastic net regularized linear +regression model and extracting model summary statistics. + +
+ +
+{% highlight scala %} +import org.apache.spark.ml.regression.LinearRegression +import org.apache.spark.mllib.util.MLUtils + +// Load training data +val training = MLUtils.loadLibSVMFile(sc, "data/mllib/sample_libsvm_data.txt").toDF() + +val lr = new LinearRegression() + .setMaxIter(10) + .setRegParam(0.3) + .setElasticNetParam(0.8) + +// Fit the model +val lrModel = lr.fit(training) + +// Print the weights and intercept for linear regression +println(s"Weights: ${lrModel.weights} Intercept: ${lrModel.intercept}") + +// Summarize the model over the training set and print out some metrics +val trainingSummary = lrModel.summary +println(s"numIterations: ${trainingSummary.totalIterations}") +println(s"objectiveHistory: ${trainingSummary.objectiveHistory.toList}") +trainingSummary.residuals.show() +println(s"RMSE: ${trainingSummary.rootMeanSquaredError}") +println(s"r2: ${trainingSummary.r2}") +{% endhighlight %} +
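As a possible follow-up to the Scala example above (not from the patch; it assumes `lrModel` and `training` are still in scope), the fitted model can be applied with `transform`:

{% highlight scala %}
// Score the training data and inspect a few predictions.
lrModel.transform(training).select("features", "label", "prediction").show(5)
{% endhighlight %}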
+ +
+{% highlight java %} +import org.apache.spark.ml.regression.LinearRegression; +import org.apache.spark.ml.regression.LinearRegressionModel; +import org.apache.spark.ml.regression.LinearRegressionTrainingSummary; +import org.apache.spark.mllib.linalg.Vectors; +import org.apache.spark.mllib.regression.LabeledPoint; +import org.apache.spark.mllib.util.MLUtils; +import org.apache.spark.SparkConf; +import org.apache.spark.SparkContext; +import org.apache.spark.sql.DataFrame; +import org.apache.spark.sql.SQLContext; + +public class LinearRegressionWithElasticNetExample { + public static void main(String[] args) { + SparkConf conf = new SparkConf() + .setAppName("Linear Regression with Elastic Net Example"); + + SparkContext sc = new SparkContext(conf); + SQLContext sql = new SQLContext(sc); + String path = "data/mllib/sample_libsvm_data.txt"; + + // Load training data + DataFrame training = sql.createDataFrame(MLUtils.loadLibSVMFile(sc, path).toJavaRDD(), LabeledPoint.class); + + LinearRegression lr = new LinearRegression() + .setMaxIter(10) + .setRegParam(0.3) + .setElasticNetParam(0.8); + + // Fit the model + LinearRegressionModel lrModel = lr.fit(training); + + // Print the weights and intercept for linear regression + System.out.println("Weights: " + lrModel.weights() + " Intercept: " + lrModel.intercept()); + + // Summarize the model over the training set and print out some metrics + LinearRegressionTrainingSummary trainingSummary = lrModel.summary(); + System.out.println("numIterations: " + trainingSummary.totalIterations()); + System.out.println("objectiveHistory: " + Vectors.dense(trainingSummary.objectiveHistory())); + trainingSummary.residuals().show(); + System.out.println("RMSE: " + trainingSummary.rootMeanSquaredError()); + System.out.println("r2: " + trainingSummary.r2()); + } +} +{% endhighlight %} +
+ +
+ +{% highlight python %} +from pyspark.ml.regression import LinearRegression +from pyspark.mllib.regression import LabeledPoint +from pyspark.mllib.util import MLUtils + +# Load training data +training = MLUtils.loadLibSVMFile(sc, "data/mllib/sample_libsvm_data.txt").toDF() + +lr = LinearRegression(maxIter=10, regParam=0.3, elasticNetParam=0.8) + +# Fit the model +lrModel = lr.fit(training) + +# Print the weights and intercept for linear regression +print("Weights: " + str(lrModel.weights)) +print("Intercept: " + str(lrModel.intercept)) + +# Linear regression model summary is not yet supported in Python. +{% endhighlight %} +
+ +
+ # Optimization The optimization algorithm underlying the implementation is called From 89b943438512fcfb239c268b43431397de46cbcf Mon Sep 17 00:00:00 2001 From: Cheng Lian Date: Thu, 27 Aug 2015 22:30:01 -0700 Subject: [PATCH 120/802] [SPARK-SQL] [MINOR] Fixes some typos in HiveContext Author: Cheng Lian Closes #8481 from liancheng/hive-context-typo. --- .../scala/org/apache/spark/sql/hive/HiveContext.scala | 8 ++++---- .../scala/org/apache/spark/sql/hive/test/TestHive.scala | 2 +- 2 files changed, 5 insertions(+), 5 deletions(-) diff --git a/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveContext.scala b/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveContext.scala index c0a458fa9ab8d..2e791cea96b41 100644 --- a/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveContext.scala +++ b/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveContext.scala @@ -171,11 +171,11 @@ class HiveContext(sc: SparkContext) extends SQLContext(sc) with Logging { * Overrides default Hive configurations to avoid breaking changes to Spark SQL users. * - allow SQL11 keywords to be used as identifiers */ - private[sql] def defaultOverides() = { + private[sql] def defaultOverrides() = { setConf(ConfVars.HIVE_SUPPORT_SQL11_RESERVED_KEYWORDS.varname, "false") } - defaultOverides() + defaultOverrides() /** * The copy of the Hive client that is used to retrieve metadata from the Hive MetaStore. @@ -190,8 +190,8 @@ class HiveContext(sc: SparkContext) extends SQLContext(sc) with Logging { // into the isolated client loader val metadataConf = new HiveConf() - val defaltWarehouseLocation = metadataConf.get("hive.metastore.warehouse.dir") - logInfo("defalt warehouse location is " + defaltWarehouseLocation) + val defaultWarehouseLocation = metadataConf.get("hive.metastore.warehouse.dir") + logInfo("default warehouse location is " + defaultWarehouseLocation) // `configure` goes second to override other settings. val allConfig = metadataConf.asScala.map(e => e.getKey -> e.getValue).toMap ++ configure diff --git a/sql/hive/src/main/scala/org/apache/spark/sql/hive/test/TestHive.scala b/sql/hive/src/main/scala/org/apache/spark/sql/hive/test/TestHive.scala index 572eaebe81ac2..57fea5d8db343 100644 --- a/sql/hive/src/main/scala/org/apache/spark/sql/hive/test/TestHive.scala +++ b/sql/hive/src/main/scala/org/apache/spark/sql/hive/test/TestHive.scala @@ -434,7 +434,7 @@ class TestHiveContext(sc: SparkContext) extends HiveContext(sc) { case (k, v) => metadataHive.runSqlHive(s"SET $k=$v") } - defaultOverides() + defaultOverrides() runSqlHive("USE default") From 7583681e6b0824d7eed471dc4d8fa0b2addf9ffc Mon Sep 17 00:00:00 2001 From: noelsmith Date: Thu, 27 Aug 2015 23:59:30 -0700 Subject: [PATCH 121/802] [SPARK-10188] [PYSPARK] Pyspark CrossValidator with RMSE selects incorrect model * Added isLargerBetter() method to Pyspark Evaluator to match the Scala version. * JavaEvaluator delegates isLargerBetter() to underlying Scala object. * Added check for isLargerBetter() in CrossValidator to determine whether to use argmin or argmax. * Added test cases for where smaller is better (RMSE) and larger is better (R-Squared). (This contribution is my original work and that I license the work to the project under Sparks' open source license) Author: noelsmith Closes #8399 from noel-smith/pyspark-rmse-xval-fix. 
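To make the selection rule described in this commit message concrete, here is a hedged Scala sketch of the same idea (illustrative only; the actual fix is in the Python diff below):

{% highlight scala %}
// Choose argmax when the evaluator's metric should be maximized (e.g. areaUnderROC, r2)
// and argmin when it should be minimized (e.g. RMSE).
def bestIndex(metrics: Array[Double], isLargerBetter: Boolean): Int =
  if (isLargerBetter) metrics.indexOf(metrics.max) else metrics.indexOf(metrics.min)
{% endhighlight %}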
--- python/pyspark/ml/evaluation.py | 12 +++++ python/pyspark/ml/tests.py | 87 +++++++++++++++++++++++++++++++++ python/pyspark/ml/tuning.py | 6 ++- 3 files changed, 104 insertions(+), 1 deletion(-) diff --git a/python/pyspark/ml/evaluation.py b/python/pyspark/ml/evaluation.py index 6b0a9ffde9f42..cb3b07947e488 100644 --- a/python/pyspark/ml/evaluation.py +++ b/python/pyspark/ml/evaluation.py @@ -66,6 +66,14 @@ def evaluate(self, dataset, params=None): else: raise ValueError("Params must be a param map but got %s." % type(params)) + def isLargerBetter(self): + """ + Indicates whether the metric returned by :py:meth:`evaluate` should be maximized + (True, default) or minimized (False). + A given evaluator may support multiple metrics which may be maximized or minimized. + """ + return True + @inherit_doc class JavaEvaluator(Evaluator, JavaWrapper): @@ -85,6 +93,10 @@ def _evaluate(self, dataset): self._transfer_params_to_java() return self._java_obj.evaluate(dataset._jdf) + def isLargerBetter(self): + self._transfer_params_to_java() + return self._java_obj.isLargerBetter() + @inherit_doc class BinaryClassificationEvaluator(JavaEvaluator, HasLabelCol, HasRawPredictionCol): diff --git a/python/pyspark/ml/tests.py b/python/pyspark/ml/tests.py index c151d21fd661a..60e4237293adc 100644 --- a/python/pyspark/ml/tests.py +++ b/python/pyspark/ml/tests.py @@ -32,11 +32,14 @@ from pyspark.tests import ReusedPySparkTestCase as PySparkTestCase from pyspark.sql import DataFrame, SQLContext +from pyspark.sql.functions import rand +from pyspark.ml.evaluation import RegressionEvaluator from pyspark.ml.param import Param, Params from pyspark.ml.param.shared import HasMaxIter, HasInputCol, HasSeed from pyspark.ml.util import keyword_only from pyspark.ml import Estimator, Model, Pipeline, Transformer from pyspark.ml.feature import * +from pyspark.ml.tuning import ParamGridBuilder, CrossValidator, CrossValidatorModel from pyspark.mllib.linalg import DenseVector @@ -264,5 +267,89 @@ def test_ngram(self): self.assertEquals(transformedDF.head().output, ["a b c d", "b c d e"]) +class HasInducedError(Params): + + def __init__(self): + super(HasInducedError, self).__init__() + self.inducedError = Param(self, "inducedError", + "Uniformly-distributed error added to feature") + + def getInducedError(self): + return self.getOrDefault(self.inducedError) + + +class InducedErrorModel(Model, HasInducedError): + + def __init__(self): + super(InducedErrorModel, self).__init__() + + def _transform(self, dataset): + return dataset.withColumn("prediction", + dataset.feature + (rand(0) * self.getInducedError())) + + +class InducedErrorEstimator(Estimator, HasInducedError): + + def __init__(self, inducedError=1.0): + super(InducedErrorEstimator, self).__init__() + self._set(inducedError=inducedError) + + def _fit(self, dataset): + model = InducedErrorModel() + self._copyValues(model) + return model + + +class CrossValidatorTests(PySparkTestCase): + + def test_fit_minimize_metric(self): + sqlContext = SQLContext(self.sc) + dataset = sqlContext.createDataFrame([ + (10, 10.0), + (50, 50.0), + (100, 100.0), + (500, 500.0)] * 10, + ["feature", "label"]) + + iee = InducedErrorEstimator() + evaluator = RegressionEvaluator(metricName="rmse") + + grid = (ParamGridBuilder() + .addGrid(iee.inducedError, [100.0, 0.0, 10000.0]) + .build()) + cv = CrossValidator(estimator=iee, estimatorParamMaps=grid, evaluator=evaluator) + cvModel = cv.fit(dataset) + bestModel = cvModel.bestModel + bestModelMetric = 
evaluator.evaluate(bestModel.transform(dataset)) + + self.assertEqual(0.0, bestModel.getOrDefault('inducedError'), + "Best model should have zero induced error") + self.assertEqual(0.0, bestModelMetric, "Best model has RMSE of 0") + + def test_fit_maximize_metric(self): + sqlContext = SQLContext(self.sc) + dataset = sqlContext.createDataFrame([ + (10, 10.0), + (50, 50.0), + (100, 100.0), + (500, 500.0)] * 10, + ["feature", "label"]) + + iee = InducedErrorEstimator() + evaluator = RegressionEvaluator(metricName="r2") + + grid = (ParamGridBuilder() + .addGrid(iee.inducedError, [100.0, 0.0, 10000.0]) + .build()) + cv = CrossValidator(estimator=iee, estimatorParamMaps=grid, evaluator=evaluator) + cvModel = cv.fit(dataset) + bestModel = cvModel.bestModel + bestModelMetric = evaluator.evaluate(bestModel.transform(dataset)) + + self.assertEqual(0.0, bestModel.getOrDefault('inducedError'), + "Best model should have zero induced error") + self.assertEqual(1.0, bestModelMetric, "Best model has R-squared of 1") + + if __name__ == "__main__": unittest.main() diff --git a/python/pyspark/ml/tuning.py b/python/pyspark/ml/tuning.py index dcfee6a3170ab..cae778869e9c5 100644 --- a/python/pyspark/ml/tuning.py +++ b/python/pyspark/ml/tuning.py @@ -223,7 +223,11 @@ def _fit(self, dataset): # TODO: duplicate evaluator to take extra params from input metric = eva.evaluate(model.transform(validation, epm[j])) metrics[j] += metric - bestIndex = np.argmax(metrics) + + if eva.isLargerBetter(): + bestIndex = np.argmax(metrics) + else: + bestIndex = np.argmin(metrics) bestModel = est.fit(dataset, epm[bestIndex]) return CrossValidatorModel(bestModel) From 2f99c37273c1d82e2ba39476e4429ea4aaba7ec6 Mon Sep 17 00:00:00 2001 From: Shivaram Venkataraman Date: Fri, 28 Aug 2015 00:37:50 -0700 Subject: [PATCH 122/802] [SPARK-10328] [SPARKR] Fix generic for na.omit S3 function is at https://stat.ethz.ch/R-manual/R-patched/library/stats/html/na.fail.html Author: Shivaram Venkataraman Author: Shivaram Venkataraman Author: Yu ISHIKAWA Closes #8495 from shivaram/na-omit-fix. --- R/pkg/R/DataFrame.R | 6 +++--- R/pkg/R/generics.R | 2 +- R/pkg/inst/tests/test_sparkSQL.R | 23 ++++++++++++++++++++++- dev/lint-r | 2 +- 4 files changed, 27 insertions(+), 6 deletions(-) diff --git a/R/pkg/R/DataFrame.R b/R/pkg/R/DataFrame.R index dd8126aebf467..74de7c81e35a6 100644 --- a/R/pkg/R/DataFrame.R +++ b/R/pkg/R/DataFrame.R @@ -1699,9 +1699,9 @@ setMethod("dropna", #' @name na.omit #' @export setMethod("na.omit", - signature(x = "DataFrame"), - function(x, how = c("any", "all"), minNonNulls = NULL, cols = NULL) { - dropna(x, how, minNonNulls, cols) + signature(object = "DataFrame"), + function(object, how = c("any", "all"), minNonNulls = NULL, cols = NULL) { + dropna(object, how, minNonNulls, cols) }) #' fillna diff --git a/R/pkg/R/generics.R b/R/pkg/R/generics.R index a829d46c1894c..b578b8789d2c5 100644 --- a/R/pkg/R/generics.R +++ b/R/pkg/R/generics.R @@ -413,7 +413,7 @@ setGeneric("dropna", #' @rdname nafunctions #' @export setGeneric("na.omit", - function(x, how = c("any", "all"), minNonNulls = NULL, cols = NULL) { + function(object, ...) 
{ standardGeneric("na.omit") }) diff --git a/R/pkg/inst/tests/test_sparkSQL.R b/R/pkg/inst/tests/test_sparkSQL.R index 4b672e115f924..933b11c8ee7e2 100644 --- a/R/pkg/inst/tests/test_sparkSQL.R +++ b/R/pkg/inst/tests/test_sparkSQL.R @@ -1083,7 +1083,7 @@ test_that("describe() and summarize() on a DataFrame", { expect_equal(collect(stats2)[5, "age"], "30") }) -test_that("dropna() on a DataFrame", { +test_that("dropna() and na.omit() on a DataFrame", { df <- jsonFile(sqlContext, jsonPathNa) rows <- collect(df) @@ -1092,6 +1092,8 @@ test_that("dropna() on a DataFrame", { expected <- rows[!is.na(rows$name),] actual <- collect(dropna(df, cols = "name")) expect_identical(expected, actual) + actual <- collect(na.omit(df, cols = "name")) + expect_identical(expected, actual) expected <- rows[!is.na(rows$age),] actual <- collect(dropna(df, cols = "age")) @@ -1101,48 +1103,67 @@ test_that("dropna() on a DataFrame", { expect_identical(expected$age, actual$age) expect_identical(expected$height, actual$height) expect_identical(expected$name, actual$name) + actual <- collect(na.omit(df, cols = "age")) expected <- rows[!is.na(rows$age) & !is.na(rows$height),] actual <- collect(dropna(df, cols = c("age", "height"))) expect_identical(expected, actual) + actual <- collect(na.omit(df, cols = c("age", "height"))) + expect_identical(expected, actual) expected <- rows[!is.na(rows$age) & !is.na(rows$height) & !is.na(rows$name),] actual <- collect(dropna(df)) expect_identical(expected, actual) + actual <- collect(na.omit(df)) + expect_identical(expected, actual) # drop with how expected <- rows[!is.na(rows$age) & !is.na(rows$height) & !is.na(rows$name),] actual <- collect(dropna(df)) expect_identical(expected, actual) + actual <- collect(na.omit(df)) + expect_identical(expected, actual) expected <- rows[!is.na(rows$age) | !is.na(rows$height) | !is.na(rows$name),] actual <- collect(dropna(df, "all")) expect_identical(expected, actual) + actual <- collect(na.omit(df, "all")) + expect_identical(expected, actual) expected <- rows[!is.na(rows$age) & !is.na(rows$height) & !is.na(rows$name),] actual <- collect(dropna(df, "any")) expect_identical(expected, actual) + actual <- collect(na.omit(df, "any")) + expect_identical(expected, actual) expected <- rows[!is.na(rows$age) & !is.na(rows$height),] actual <- collect(dropna(df, "any", cols = c("age", "height"))) expect_identical(expected, actual) + actual <- collect(na.omit(df, "any", cols = c("age", "height"))) + expect_identical(expected, actual) expected <- rows[!is.na(rows$age) | !is.na(rows$height),] actual <- collect(dropna(df, "all", cols = c("age", "height"))) expect_identical(expected, actual) + actual <- collect(na.omit(df, "all", cols = c("age", "height"))) + expect_identical(expected, actual) # drop with threshold expected <- rows[as.integer(!is.na(rows$age)) + as.integer(!is.na(rows$height)) >= 2,] actual <- collect(dropna(df, minNonNulls = 2, cols = c("age", "height"))) expect_identical(expected, actual) + actual <- collect(na.omit(df, minNonNulls = 2, cols = c("age", "height"))) + expect_identical(expected, actual) expected <- rows[as.integer(!is.na(rows$age)) + as.integer(!is.na(rows$height)) + as.integer(!is.na(rows$name)) >= 3,] actual <- collect(dropna(df, minNonNulls = 3, cols = c("name", "age", "height"))) expect_identical(expected, actual) + actual <- collect(na.omit(df, minNonNulls = 3, cols = c("name", "age", "height"))) + expect_identical(expected, actual) }) test_that("fillna() on a DataFrame", { diff --git a/dev/lint-r b/dev/lint-r index 
c15d57aad86da..bfda0bca15eb7 100755 --- a/dev/lint-r +++ b/dev/lint-r @@ -29,7 +29,7 @@ fi `which Rscript` --vanilla "$SPARK_ROOT_DIR/dev/lint-r.R" "$SPARK_ROOT_DIR" | tee "$LINT_R_REPORT_FILE_NAME" -NUM_LINES=`wc -l < "$LINT_R_REPORT_FILE_NAME"` +NUM_LINES=`wc -l < "$LINT_R_REPORT_FILE_NAME" | awk '{print $1}'` if [ "$NUM_LINES" = "0" ] ; then lint_status=0 echo "lintr checks passed." From 4eeda8d472498acd40ef54723d1be9924a273d76 Mon Sep 17 00:00:00 2001 From: Yu ISHIKAWA Date: Fri, 28 Aug 2015 00:50:26 -0700 Subject: [PATCH 123/802] [SPARK-10260] [ML] Add @Since annotation to ml.clustering ### JIRA [[SPARK-10260] Add Since annotation to ml.clustering - ASF JIRA](https://issues.apache.org/jira/browse/SPARK-10260) Author: Yu ISHIKAWA Closes #8455 from yu-iskw/SPARK-10260. --- .../apache/spark/ml/clustering/KMeans.scala | 32 +++++++++++++++++-- 1 file changed, 29 insertions(+), 3 deletions(-) diff --git a/mllib/src/main/scala/org/apache/spark/ml/clustering/KMeans.scala b/mllib/src/main/scala/org/apache/spark/ml/clustering/KMeans.scala index 47a18cdb31b53..f40ab71fb22a6 100644 --- a/mllib/src/main/scala/org/apache/spark/ml/clustering/KMeans.scala +++ b/mllib/src/main/scala/org/apache/spark/ml/clustering/KMeans.scala @@ -17,7 +17,7 @@ package org.apache.spark.ml.clustering -import org.apache.spark.annotation.Experimental +import org.apache.spark.annotation.{Since, Experimental} import org.apache.spark.ml.param.{Param, Params, IntParam, ParamMap} import org.apache.spark.ml.param.shared._ import org.apache.spark.ml.util.{Identifiable, SchemaUtils} @@ -39,9 +39,11 @@ private[clustering] trait KMeansParams extends Params with HasMaxIter with HasFe * Set the number of clusters to create (k). Must be > 1. Default: 2. * @group param */ + @Since("1.5.0") final val k = new IntParam(this, "k", "number of clusters to create", (x: Int) => x > 1) /** @group getParam */ + @Since("1.5.0") def getK: Int = $(k) /** @@ -50,10 +52,12 @@ private[clustering] trait KMeansParams extends Params with HasMaxIter with HasFe * (Bahmani et al., Scalable K-Means++, VLDB 2012). Default: k-means||. * @group expertParam */ + @Since("1.5.0") final val initMode = new Param[String](this, "initMode", "initialization algorithm", (value: String) => MLlibKMeans.validateInitMode(value)) /** @group expertGetParam */ + @Since("1.5.0") def getInitMode: String = $(initMode) /** @@ -61,10 +65,12 @@ private[clustering] trait KMeansParams extends Params with HasMaxIter with HasFe * setting -- the default of 5 is almost always enough. Must be > 0. Default: 5. * @group expertParam */ + @Since("1.5.0") final val initSteps = new IntParam(this, "initSteps", "number of steps for k-means||", (value: Int) => value > 0) /** @group expertGetParam */ + @Since("1.5.0") def getInitSteps: Int = $(initSteps) /** @@ -84,27 +90,32 @@ private[clustering] trait KMeansParams extends Params with HasMaxIter with HasFe * * @param parentModel a model trained by spark.mllib.clustering.KMeans. 
*/ +@Since("1.5.0") @Experimental class KMeansModel private[ml] ( - override val uid: String, + @Since("1.5.0") override val uid: String, private val parentModel: MLlibKMeansModel) extends Model[KMeansModel] with KMeansParams { + @Since("1.5.0") override def copy(extra: ParamMap): KMeansModel = { val copied = new KMeansModel(uid, parentModel) copyValues(copied, extra) } + @Since("1.5.0") override def transform(dataset: DataFrame): DataFrame = { val predictUDF = udf((vector: Vector) => predict(vector)) dataset.withColumn($(predictionCol), predictUDF(col($(featuresCol)))) } + @Since("1.5.0") override def transformSchema(schema: StructType): StructType = { validateAndTransformSchema(schema) } private[clustering] def predict(features: Vector): Int = parentModel.predict(features) + @Since("1.5.0") def clusterCenters: Array[Vector] = parentModel.clusterCenters } @@ -114,8 +125,11 @@ class KMeansModel private[ml] ( * * @see [[http://dx.doi.org/10.14778/2180912.2180915 Bahmani et al., Scalable k-means++.]] */ +@Since("1.5.0") @Experimental -class KMeans(override val uid: String) extends Estimator[KMeansModel] with KMeansParams { +class KMeans @Since("1.5.0") ( + @Since("1.5.0") override val uid: String) + extends Estimator[KMeansModel] with KMeansParams { setDefault( k -> 2, @@ -124,34 +138,45 @@ class KMeans(override val uid: String) extends Estimator[KMeansModel] with KMean initSteps -> 5, tol -> 1e-4) + @Since("1.5.0") override def copy(extra: ParamMap): KMeans = defaultCopy(extra) + @Since("1.5.0") def this() = this(Identifiable.randomUID("kmeans")) /** @group setParam */ + @Since("1.5.0") def setFeaturesCol(value: String): this.type = set(featuresCol, value) /** @group setParam */ + @Since("1.5.0") def setPredictionCol(value: String): this.type = set(predictionCol, value) /** @group setParam */ + @Since("1.5.0") def setK(value: Int): this.type = set(k, value) /** @group expertSetParam */ + @Since("1.5.0") def setInitMode(value: String): this.type = set(initMode, value) /** @group expertSetParam */ + @Since("1.5.0") def setInitSteps(value: Int): this.type = set(initSteps, value) /** @group setParam */ + @Since("1.5.0") def setMaxIter(value: Int): this.type = set(maxIter, value) /** @group setParam */ + @Since("1.5.0") def setTol(value: Double): this.type = set(tol, value) /** @group setParam */ + @Since("1.5.0") def setSeed(value: Long): this.type = set(seed, value) + @Since("1.5.0") override def fit(dataset: DataFrame): KMeansModel = { val rdd = dataset.select(col($(featuresCol))).map { case Row(point: Vector) => point } @@ -167,6 +192,7 @@ class KMeans(override val uid: String) extends Estimator[KMeansModel] with KMean copyValues(model) } + @Since("1.5.0") override def transformSchema(schema: StructType): StructType = { validateAndTransformSchema(schema) } From cc39803062119c1d14611dc227b9ed0ed1284d38 Mon Sep 17 00:00:00 2001 From: Sean Owen Date: Fri, 28 Aug 2015 09:32:23 +0100 Subject: [PATCH 124/802] [SPARK-10295] [CORE] Dynamic allocation in Mesos does not release when RDDs are cached Remove obsolete warning about dynamic allocation not working with cached RDDs See discussion in https://issues.apache.org/jira/browse/SPARK-10295 Author: Sean Owen Closes #8489 from srowen/SPARK-10295. 
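For context only, not taken from the patch: the removed warning fired whenever an application had dynamic allocation enabled and then cached an RDD. A minimal sketch of that kind of configuration, with a made-up application name, looks like the following; the two `spark.*` keys are the standard dynamic-allocation settings, and benefiting from them still requires a cluster manager with the external shuffle service running.

```python
# Minimal sketch, not from this patch: the configuration the removed warning
# used to complain about (dynamic allocation enabled in a job that caches RDDs).
# The application name is a placeholder; submit with a real cluster manager.
from pyspark import SparkConf

conf = (SparkConf()
        .setAppName("dynamic-allocation-with-cached-rdd")
        .set("spark.dynamicAllocation.enabled", "true")
        .set("spark.shuffle.service.enabled", "true"))  # required by dynamic allocation

print(conf.toDebugString())
# In the driver one would then cache data as usual, for example
#     rdd = sc.parallelize(range(1000)).cache()
# and after this patch persistRDD() no longer logs a warning about it.
```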
--- core/src/main/scala/org/apache/spark/SparkContext.scala | 5 ----- 1 file changed, 5 deletions(-) diff --git a/core/src/main/scala/org/apache/spark/SparkContext.scala b/core/src/main/scala/org/apache/spark/SparkContext.scala index f3da04a7f55d0..738887076b0d1 100644 --- a/core/src/main/scala/org/apache/spark/SparkContext.scala +++ b/core/src/main/scala/org/apache/spark/SparkContext.scala @@ -1590,11 +1590,6 @@ class SparkContext(config: SparkConf) extends Logging with ExecutorAllocationCli * Register an RDD to be persisted in memory and/or disk storage */ private[spark] def persistRDD(rdd: RDD[_]) { - _executorAllocationManager.foreach { _ => - logWarning( - s"Dynamic allocation currently does not support cached RDDs. Cached data for RDD " + - s"${rdd.id} will be lost when executors are removed.") - } persistentRdds(rdd.id) = rdd } From 18294cd8710427076caa86bfac596de67089d57e Mon Sep 17 00:00:00 2001 From: Keiji Yoshida Date: Fri, 28 Aug 2015 09:36:50 +0100 Subject: [PATCH 125/802] Fix DynamodDB/DynamoDB typo in Kinesis Integration doc Fix DynamodDB/DynamoDB typo in Kinesis Integration doc Author: Keiji Yoshida Closes #8501 from yosssi/patch-1. --- docs/streaming-kinesis-integration.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/streaming-kinesis-integration.md b/docs/streaming-kinesis-integration.md index a7bcaec6fcd84..238a911a9199f 100644 --- a/docs/streaming-kinesis-integration.md +++ b/docs/streaming-kinesis-integration.md @@ -91,7 +91,7 @@ A Kinesis stream can be set up at one of the valid Kinesis endpoints with 1 or m - Kinesis data processing is ordered per partition and occurs at-least once per message. - - Multiple applications can read from the same Kinesis stream. Kinesis will maintain the application-specific shard and checkpoint info in DynamodDB. + - Multiple applications can read from the same Kinesis stream. Kinesis will maintain the application-specific shard and checkpoint info in DynamoDB. - A single Kinesis stream shard is processed by one input DStream at a time. From 71a077f6c16c8816eae13341f645ba50d997f63d Mon Sep 17 00:00:00 2001 From: Dharmesh Kakadia Date: Fri, 28 Aug 2015 09:38:35 +0100 Subject: [PATCH 126/802] typo in comment Author: Dharmesh Kakadia Closes #8497 from dharmeshkakadia/patch-2. --- .../apache/spark/network/shuffle/protocol/RegisterExecutor.java | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/network/shuffle/src/main/java/org/apache/spark/network/shuffle/protocol/RegisterExecutor.java b/network/shuffle/src/main/java/org/apache/spark/network/shuffle/protocol/RegisterExecutor.java index cca8b17c4f129..167ef33104227 100644 --- a/network/shuffle/src/main/java/org/apache/spark/network/shuffle/protocol/RegisterExecutor.java +++ b/network/shuffle/src/main/java/org/apache/spark/network/shuffle/protocol/RegisterExecutor.java @@ -27,7 +27,7 @@ /** * Initial registration message between an executor and its local shuffle server. - * Returns nothing (empty bye array). + * Returns nothing (empty byte array). */ public class RegisterExecutor extends BlockTransferMessage { public final String appId; From 1502a0f6c5d2f85a331b29d3bf50002911ea393e Mon Sep 17 00:00:00 2001 From: jerryshao Date: Fri, 28 Aug 2015 09:32:52 -0500 Subject: [PATCH 127/802] [YARN] [MINOR] Avoid hard code port number in YarnShuffleService test Current port number is fixed as default (7337) in test, this will introduce port contention exception, better to change to a random number in unit test. 
squito , seems you're author of this unit test, mind taking a look at this fix? Thanks a lot. ``` [info] - executor state kept across NM restart *** FAILED *** (597 milliseconds) [info] org.apache.hadoop.service.ServiceStateException: java.net.BindException: Address already in use [info] at org.apache.hadoop.service.ServiceStateException.convert(ServiceStateException.java:59) [info] at org.apache.hadoop.service.AbstractService.init(AbstractService.java:172) [info] at org.apache.spark.network.yarn.YarnShuffleServiceSuite$$anonfun$1.apply$mcV$sp(YarnShuffleServiceSuite.scala:72) [info] at org.apache.spark.network.yarn.YarnShuffleServiceSuite$$anonfun$1.apply(YarnShuffleServiceSuite.scala:70) [info] at org.apache.spark.network.yarn.YarnShuffleServiceSuite$$anonfun$1.apply(YarnShuffleServiceSuite.scala:70) [info] at org.scalatest.Transformer$$anonfun$apply$1.apply$mcV$sp(Transformer.scala:22) [info] at org.scalatest.OutcomeOf$class.outcomeOf(OutcomeOf.scala:85) [info] at org.scalatest.OutcomeOf$.outcomeOf(OutcomeOf.scala:104) [info] at org.scalatest.Transformer.apply(Transformer.scala:22) [info] at org.scalatest.Transformer.apply(Transformer.scala:20) [info] at org.scalatest.FunSuiteLike$$anon$1.apply(FunSuiteLike.scala:166) [info] at org.apache.spark.SparkFunSuite.withFixture(SparkFunSuite.scala:42) ... ``` Author: jerryshao Closes #8502 from jerryshao/avoid-hardcode-port. --- .../org/apache/spark/network/yarn/YarnShuffleServiceSuite.scala | 1 + 1 file changed, 1 insertion(+) diff --git a/yarn/src/test/scala/org/apache/spark/network/yarn/YarnShuffleServiceSuite.scala b/yarn/src/test/scala/org/apache/spark/network/yarn/YarnShuffleServiceSuite.scala index 2f22cbdbeac37..6aa8c814cd4f0 100644 --- a/yarn/src/test/scala/org/apache/spark/network/yarn/YarnShuffleServiceSuite.scala +++ b/yarn/src/test/scala/org/apache/spark/network/yarn/YarnShuffleServiceSuite.scala @@ -37,6 +37,7 @@ class YarnShuffleServiceSuite extends SparkFunSuite with Matchers with BeforeAnd yarnConfig.set(YarnConfiguration.NM_AUX_SERVICES, "spark_shuffle") yarnConfig.set(YarnConfiguration.NM_AUX_SERVICE_FMT.format("spark_shuffle"), classOf[YarnShuffleService].getCanonicalName) + yarnConfig.setInt("spark.shuffle.service.port", 0) yarnConfig.get("yarn.nodemanager.local-dirs").split(",").foreach { dir => val d = new File(dir) From e2a843090cb031f6aa774f6d9c031a7f0f732ee1 Mon Sep 17 00:00:00 2001 From: Yuhao Yang Date: Fri, 28 Aug 2015 08:00:44 -0700 Subject: [PATCH 128/802] [SPARK-9890] [DOC] [ML] User guide for CountVectorizer jira: https://issues.apache.org/jira/browse/SPARK-9890 document with Scala and java examples Author: Yuhao Yang Closes #8487 from hhbyyh/cvDoc. --- docs/ml-features.md | 109 ++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 109 insertions(+) diff --git a/docs/ml-features.md b/docs/ml-features.md index 89a9bad570446..90654d1e5a248 100644 --- a/docs/ml-features.md +++ b/docs/ml-features.md @@ -211,6 +211,115 @@ for feature in result.select("result").take(3): +## CountVectorizer + +`CountVectorizer` and `CountVectorizerModel` aim to help convert a collection of text documents + to vectors of token counts. When an a-priori dictionary is not available, `CountVectorizer` can + be used as an `Estimator` to extract the vocabulary and generates a `CountVectorizerModel`. The + model produces sparse representations for the documents over the vocabulary, which can then be + passed to other algorithms like LDA. 
+ + During the fitting process, `CountVectorizer` will select the top `vocabSize` words ordered by + term frequency across the corpus. An optional parameter "minDF" also affect the fitting process + by specifying the minimum number (or fraction if < 1.0) of documents a term must appear in to be + included in the vocabulary. + +**Examples** + +Assume that we have the following DataFrame with columns `id` and `texts`: + +~~~~ + id | texts +----|---------- + 0 | Array("a", "b", "c") + 1 | Array("a", "b", "b", "c", "a") +~~~~ + +each row in`texts` is a document of type Array[String]. +Invoking fit of `CountVectorizer` produces a `CountVectorizerModel` with vocabulary (a, b, c), +then the output column "vector" after transformation contains: + +~~~~ + id | texts | vector +----|---------------------------------|--------------- + 0 | Array("a", "b", "c") | (3,[0,1,2],[1.0,1.0,1.0]) + 1 | Array("a", "b", "b", "c", "a") | (3,[0,1,2],[2.0,2.0,1.0]) +~~~~ + +each vector represents the token counts of the document over the vocabulary. + +
+<div data-lang="scala" markdown="1">
+More details can be found in the API docs for +[CountVectorizer](api/scala/index.html#org.apache.spark.ml.feature.CountVectorizer) and +[CountVectorizerModel](api/scala/index.html#org.apache.spark.ml.feature.CountVectorizerModel). +{% highlight scala %} +import org.apache.spark.ml.feature.CountVectorizer +import org.apache.spark.mllib.util.CountVectorizerModel + +val df = sqlContext.createDataFrame(Seq( + (0, Array("a", "b", "c")), + (1, Array("a", "b", "b", "c", "a")) +)).toDF("id", "words") + +// fit a CountVectorizerModel from the corpus +val cvModel: CountVectorizerModel = new CountVectorizer() + .setInputCol("words") + .setOutputCol("features") + .setVocabSize(3) + .setMinDF(2) // a term must appear in more or equal to 2 documents to be included in the vocabulary + .fit(df) + +// alternatively, define CountVectorizerModel with a-priori vocabulary +val cvm = new CountVectorizerModel(Array("a", "b", "c")) + .setInputCol("words") + .setOutputCol("features") + +cvModel.transform(df).select("features").show() +{% endhighlight %} +
+<div data-lang="java" markdown="1">
+More details can be found in the API docs for +[CountVectorizer](api/java/org/apache/spark/ml/feature/CountVectorizer.html) and +[CountVectorizerModel](api/java/org/apache/spark/ml/feature/CountVectorizerModel.html). +{% highlight java %} +import org.apache.spark.api.java.JavaRDD; +import org.apache.spark.ml.feature.CountVectorizer; +import org.apache.spark.ml.feature.CountVectorizerModel; +import org.apache.spark.sql.DataFrame; +import org.apache.spark.sql.Row; +import org.apache.spark.sql.RowFactory; +import org.apache.spark.sql.types.*; + +// Input data: Each row is a bag of words from a sentence or document. +JavaRDD jrdd = jsc.parallelize(Arrays.asList( + RowFactory.create(Arrays.asList("a", "b", "c")), + RowFactory.create(Arrays.asList("a", "b", "b", "c", "a")) +)); +StructType schema = new StructType(new StructField [] { + new StructField("text", new ArrayType(DataTypes.StringType, true), false, Metadata.empty()) +}); +DataFrame df = sqlContext.createDataFrame(jrdd, schema); + +// fit a CountVectorizerModel from the corpus +CountVectorizerModel cvModel = new CountVectorizer() + .setInputCol("text") + .setOutputCol("feature") + .setVocabSize(3) + .setMinDF(2) // a term must appear in more or equal to 2 documents to be included in the vocabulary + .fit(df); + +// alternatively, define CountVectorizerModel with a-priori vocabulary +CountVectorizerModel cvm = new CountVectorizerModel(new String[]{"a", "b", "c"}) + .setInputCol("text") + .setOutputCol("feature"); + +cvModel.transform(df).show(); +{% endhighlight %} +
+</div>
+ # Feature Transformers ## Tokenizer From 499e8e154bdcc9d7b2f685b159e0ddb4eae48fe4 Mon Sep 17 00:00:00 2001 From: Luciano Resende Date: Fri, 28 Aug 2015 09:13:21 -0700 Subject: [PATCH 129/802] [SPARK-8952] [SPARKR] - Wrap normalizePath calls with suppressWarnings This is based on davies comment on SPARK-8952 which suggests to only call normalizePath() when path starts with '~' Author: Luciano Resende Closes #8343 from lresende/SPARK-8952. --- R/pkg/R/SQLContext.R | 4 ++-- R/pkg/R/sparkR.R | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/R/pkg/R/SQLContext.R b/R/pkg/R/SQLContext.R index 110117a18ccbc..1bc6445311473 100644 --- a/R/pkg/R/SQLContext.R +++ b/R/pkg/R/SQLContext.R @@ -201,7 +201,7 @@ setMethod("toDF", signature(x = "RDD"), jsonFile <- function(sqlContext, path) { # Allow the user to have a more flexible definiton of the text file path - path <- normalizePath(path) + path <- suppressWarnings(normalizePath(path)) # Convert a string vector of paths to a string containing comma separated paths path <- paste(path, collapse = ",") sdf <- callJMethod(sqlContext, "jsonFile", path) @@ -251,7 +251,7 @@ jsonRDD <- function(sqlContext, rdd, schema = NULL, samplingRatio = 1.0) { # TODO: Implement saveasParquetFile and write examples for both parquetFile <- function(sqlContext, ...) { # Allow the user to have a more flexible definiton of the text file path - paths <- lapply(list(...), normalizePath) + paths <- lapply(list(...), function(x) suppressWarnings(normalizePath(x))) sdf <- callJMethod(sqlContext, "parquetFile", paths) dataFrame(sdf) } diff --git a/R/pkg/R/sparkR.R b/R/pkg/R/sparkR.R index e83104f116422..3c57a44db257d 100644 --- a/R/pkg/R/sparkR.R +++ b/R/pkg/R/sparkR.R @@ -160,7 +160,7 @@ sparkR.init <- function( }) if (nchar(sparkHome) != 0) { - sparkHome <- normalizePath(sparkHome) + sparkHome <- suppressWarnings(normalizePath(sparkHome)) } sparkEnvirMap <- new.env() From d3f87dc39480f075170817bbd00142967a938078 Mon Sep 17 00:00:00 2001 From: Josh Rosen Date: Fri, 28 Aug 2015 11:51:42 -0700 Subject: [PATCH 130/802] [SPARK-10325] Override hashCode() for public Row This commit fixes an issue where the public SQL `Row` class did not override `hashCode`, causing it to violate the hashCode() + equals() contract. To fix this, I simply ported the `hashCode` implementation from the 1.4.x version of `Row`. Author: Josh Rosen Closes #8500 from JoshRosen/SPARK-10325 and squashes the following commits: 51ffea1 [Josh Rosen] Override hashCode() for public Row. --- .../src/main/scala/org/apache/spark/sql/Row.scala | 13 +++++++++++++ .../test/scala/org/apache/spark/sql/RowSuite.scala | 9 +++++++++ 2 files changed, 22 insertions(+) diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/Row.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/Row.scala index cfd9cb0e62598..ed2fdf9f2f7cf 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/Row.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/Row.scala @@ -18,6 +18,7 @@ package org.apache.spark.sql import scala.collection.JavaConverters._ +import scala.util.hashing.MurmurHash3 import org.apache.spark.sql.catalyst.InternalRow import org.apache.spark.sql.catalyst.expressions.GenericRow @@ -410,6 +411,18 @@ trait Row extends Serializable { true } + override def hashCode: Int = { + // Using Scala's Seq hash code implementation. 
+ var n = 0 + var h = MurmurHash3.seqSeed + val len = length + while (n < len) { + h = MurmurHash3.mix(h, apply(n).##) + n += 1 + } + MurmurHash3.finalizeHash(h, n) + } + /* ---------------------- utility methods for Scala ---------------------- */ /** diff --git a/sql/core/src/test/scala/org/apache/spark/sql/RowSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/RowSuite.scala index 795d4e983f27e..77ccd6f775e50 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/RowSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/RowSuite.scala @@ -85,4 +85,13 @@ class RowSuite extends SparkFunSuite with SharedSQLContext { val r2 = Row(Double.NaN) assert(r1 === r2) } + + test("equals and hashCode") { + val r1 = Row("Hello") + val r2 = Row("Hello") + assert(r1 === r2) + assert(r1.hashCode() === r2.hashCode()) + val r3 = Row("World") + assert(r3.hashCode() != r1.hashCode()) + } } From c53c902fa9c458200245f919067b41dde9cd9418 Mon Sep 17 00:00:00 2001 From: Marcelo Vanzin Date: Fri, 28 Aug 2015 12:33:40 -0700 Subject: [PATCH 131/802] [SPARK-9284] [TESTS] Allow all tests to run without an assembly. This change aims at speeding up the dev cycle a little bit, by making sure that all tests behave the same w.r.t. where the code to be tested is loaded from. Namely, that means that tests don't rely on the assembly anymore, rather loading all needed classes from the build directories. The main change is to make sure all build directories (classes and test-classes) are added to the classpath of child processes when running tests. YarnClusterSuite required some custom code since the executors are run differently (i.e. not through the launcher library, like standalone and Mesos do). I also found a couple of tests that could leak a SparkContext on failure, and added code to handle those. With this patch, it's possible to run the following command from a clean source directory and have all tests pass: mvn -Pyarn -Phadoop-2.4 -Phive-thriftserver install Author: Marcelo Vanzin Closes #7629 from vanzin/SPARK-9284. --- bin/spark-class | 16 ++++---- .../spark/launcher/SparkLauncherSuite.java | 0 .../spark/broadcast/BroadcastSuite.scala | 10 ++++- .../launcher/AbstractCommandBuilder.java | 28 ++++++++------ pom.xml | 8 ++++ project/SparkBuild.scala | 2 + .../deploy/yarn/BaseYarnClusterSuite.scala | 37 +++++++++++++------ .../spark/deploy/yarn/YarnClusterSuite.scala | 20 ++++++++-- .../yarn/YarnShuffleIntegrationSuite.scala | 2 +- .../spark/launcher/TestClasspathBuilder.scala | 36 ++++++++++++++++++ 10 files changed, 122 insertions(+), 37 deletions(-) rename {launcher => core}/src/test/java/org/apache/spark/launcher/SparkLauncherSuite.java (100%) create mode 100644 yarn/src/test/scala/org/apache/spark/launcher/TestClasspathBuilder.scala diff --git a/bin/spark-class b/bin/spark-class index 2b59e5df5736f..e38e08dec40e4 100755 --- a/bin/spark-class +++ b/bin/spark-class @@ -43,17 +43,19 @@ else fi num_jars="$(ls -1 "$ASSEMBLY_DIR" | grep "^spark-assembly.*hadoop.*\.jar$" | wc -l)" -if [ "$num_jars" -eq "0" -a -z "$SPARK_ASSEMBLY_JAR" ]; then +if [ "$num_jars" -eq "0" -a -z "$SPARK_ASSEMBLY_JAR" -a "$SPARK_PREPEND_CLASSES" != "1" ]; then echo "Failed to find Spark assembly in $ASSEMBLY_DIR." 1>&2 echo "You need to build Spark before running this program." 
1>&2 exit 1 fi -ASSEMBLY_JARS="$(ls -1 "$ASSEMBLY_DIR" | grep "^spark-assembly.*hadoop.*\.jar$" || true)" -if [ "$num_jars" -gt "1" ]; then - echo "Found multiple Spark assembly jars in $ASSEMBLY_DIR:" 1>&2 - echo "$ASSEMBLY_JARS" 1>&2 - echo "Please remove all but one jar." 1>&2 - exit 1 +if [ -d "$ASSEMBLY_DIR" ]; then + ASSEMBLY_JARS="$(ls -1 "$ASSEMBLY_DIR" | grep "^spark-assembly.*hadoop.*\.jar$" || true)" + if [ "$num_jars" -gt "1" ]; then + echo "Found multiple Spark assembly jars in $ASSEMBLY_DIR:" 1>&2 + echo "$ASSEMBLY_JARS" 1>&2 + echo "Please remove all but one jar." 1>&2 + exit 1 + fi fi SPARK_ASSEMBLY_JAR="${ASSEMBLY_DIR}/${ASSEMBLY_JARS}" diff --git a/launcher/src/test/java/org/apache/spark/launcher/SparkLauncherSuite.java b/core/src/test/java/org/apache/spark/launcher/SparkLauncherSuite.java similarity index 100% rename from launcher/src/test/java/org/apache/spark/launcher/SparkLauncherSuite.java rename to core/src/test/java/org/apache/spark/launcher/SparkLauncherSuite.java diff --git a/core/src/test/scala/org/apache/spark/broadcast/BroadcastSuite.scala b/core/src/test/scala/org/apache/spark/broadcast/BroadcastSuite.scala index 48e74f06f79b1..fb7a8ae3f9d41 100644 --- a/core/src/test/scala/org/apache/spark/broadcast/BroadcastSuite.scala +++ b/core/src/test/scala/org/apache/spark/broadcast/BroadcastSuite.scala @@ -310,8 +310,14 @@ class BroadcastSuite extends SparkFunSuite with LocalSparkContext { val _sc = new SparkContext("local-cluster[%d, 1, 1024]".format(numSlaves), "test", broadcastConf) // Wait until all salves are up - _sc.jobProgressListener.waitUntilExecutorsUp(numSlaves, 10000) - _sc + try { + _sc.jobProgressListener.waitUntilExecutorsUp(numSlaves, 10000) + _sc + } catch { + case e: Throwable => + _sc.stop() + throw e + } } else { new SparkContext("local", "test", broadcastConf) } diff --git a/launcher/src/main/java/org/apache/spark/launcher/AbstractCommandBuilder.java b/launcher/src/main/java/org/apache/spark/launcher/AbstractCommandBuilder.java index 5e793a5c48775..0a237ee73b670 100644 --- a/launcher/src/main/java/org/apache/spark/launcher/AbstractCommandBuilder.java +++ b/launcher/src/main/java/org/apache/spark/launcher/AbstractCommandBuilder.java @@ -169,9 +169,11 @@ List buildClassPath(String appClassPath) throws IOException { "streaming", "tools", "sql/catalyst", "sql/core", "sql/hive", "sql/hive-thriftserver", "yarn", "launcher"); if (prependClasses) { - System.err.println( - "NOTE: SPARK_PREPEND_CLASSES is set, placing locally compiled Spark classes ahead of " + - "assembly."); + if (!isTesting) { + System.err.println( + "NOTE: SPARK_PREPEND_CLASSES is set, placing locally compiled Spark classes ahead of " + + "assembly."); + } for (String project : projects) { addToClassPath(cp, String.format("%s/%s/target/scala-%s/classes", sparkHome, project, scala)); @@ -200,7 +202,7 @@ List buildClassPath(String appClassPath) throws IOException { // For the user code case, we fall back to looking for the Spark assembly under SPARK_HOME. // That duplicates some of the code in the shell scripts that look for the assembly, though. 
String assembly = getenv(ENV_SPARK_ASSEMBLY); - if (assembly == null && isEmpty(getenv("SPARK_TESTING"))) { + if (assembly == null && !isTesting) { assembly = findAssembly(); } addToClassPath(cp, assembly); @@ -215,12 +217,14 @@ List buildClassPath(String appClassPath) throws IOException { libdir = new File(sparkHome, "lib_managed/jars"); } - checkState(libdir.isDirectory(), "Library directory '%s' does not exist.", - libdir.getAbsolutePath()); - for (File jar : libdir.listFiles()) { - if (jar.getName().startsWith("datanucleus-")) { - addToClassPath(cp, jar.getAbsolutePath()); + if (libdir.isDirectory()) { + for (File jar : libdir.listFiles()) { + if (jar.getName().startsWith("datanucleus-")) { + addToClassPath(cp, jar.getAbsolutePath()); + } } + } else { + checkState(isTesting, "Library directory '%s' does not exist.", libdir.getAbsolutePath()); } addToClassPath(cp, getenv("HADOOP_CONF_DIR")); @@ -256,15 +260,15 @@ String getScalaVersion() { return scala; } String sparkHome = getSparkHome(); - File scala210 = new File(sparkHome, "assembly/target/scala-2.10"); - File scala211 = new File(sparkHome, "assembly/target/scala-2.11"); + File scala210 = new File(sparkHome, "launcher/target/scala-2.10"); + File scala211 = new File(sparkHome, "launcher/target/scala-2.11"); checkState(!scala210.isDirectory() || !scala211.isDirectory(), "Presence of build for both scala versions (2.10 and 2.11) detected.\n" + "Either clean one of them or set SPARK_SCALA_VERSION in your environment."); if (scala210.isDirectory()) { return "2.10"; } else { - checkState(scala211.isDirectory(), "Cannot find any assembly build directories."); + checkState(scala211.isDirectory(), "Cannot find any build directories."); return "2.11"; } } diff --git a/pom.xml b/pom.xml index 0716016523ee1..88ebceca769e9 100644 --- a/pom.xml +++ b/pom.xml @@ -1421,6 +1421,10 @@ org.apache.thrift libthrift + + org.mortbay.jetty + servlet-api + com.google.guava guava @@ -1892,6 +1896,8 @@ launched by the tests have access to the correct test-time classpath. --> ${test_classpath} + 1 + 1 ${test.java.home} @@ -1929,6 +1935,8 @@ launched by the tests have access to the correct test-time classpath. 
--> ${test_classpath} + 1 + 1 ${test.java.home} diff --git a/project/SparkBuild.scala b/project/SparkBuild.scala index ea52bfd67944a..901cfa538d23e 100644 --- a/project/SparkBuild.scala +++ b/project/SparkBuild.scala @@ -547,6 +547,8 @@ object TestSettings { envVars in Test ++= Map( "SPARK_DIST_CLASSPATH" -> (fullClasspath in Test).value.files.map(_.getAbsolutePath).mkString(":").stripSuffix(":"), + "SPARK_PREPEND_CLASSES" -> "1", + "SPARK_TESTING" -> "1", "JAVA_HOME" -> sys.env.get("JAVA_HOME").getOrElse(sys.props("java.home"))), javaOptions in Test += s"-Djava.io.tmpdir=$testTempDir", javaOptions in Test += "-Dspark.test.home=" + sparkHome, diff --git a/yarn/src/test/scala/org/apache/spark/deploy/yarn/BaseYarnClusterSuite.scala b/yarn/src/test/scala/org/apache/spark/deploy/yarn/BaseYarnClusterSuite.scala index b4f8049bff577..17c59ff06e0c1 100644 --- a/yarn/src/test/scala/org/apache/spark/deploy/yarn/BaseYarnClusterSuite.scala +++ b/yarn/src/test/scala/org/apache/spark/deploy/yarn/BaseYarnClusterSuite.scala @@ -30,6 +30,7 @@ import org.apache.hadoop.yarn.server.MiniYARNCluster import org.scalatest.{BeforeAndAfterAll, Matchers} import org.apache.spark._ +import org.apache.spark.launcher.TestClasspathBuilder import org.apache.spark.util.Utils abstract class BaseYarnClusterSuite @@ -43,6 +44,9 @@ abstract class BaseYarnClusterSuite |log4j.appender.console.target=System.err |log4j.appender.console.layout=org.apache.log4j.PatternLayout |log4j.appender.console.layout.ConversionPattern=%d{yy/MM/dd HH:mm:ss} %p %c{1}: %m%n + |log4j.logger.org.apache.hadoop=WARN + |log4j.logger.org.eclipse.jetty=WARN + |log4j.logger.org.spark-project.jetty=WARN """.stripMargin private var yarnCluster: MiniYARNCluster = _ @@ -51,8 +55,7 @@ abstract class BaseYarnClusterSuite private var hadoopConfDir: File = _ private var logConfDir: File = _ - - def yarnConfig: YarnConfiguration + def newYarnConfig(): YarnConfiguration override def beforeAll() { super.beforeAll() @@ -65,8 +68,14 @@ abstract class BaseYarnClusterSuite val logConfFile = new File(logConfDir, "log4j.properties") Files.write(LOG4J_CONF, logConfFile, UTF_8) + // Disable the disk utilization check to avoid the test hanging when people's disks are + // getting full. 
+ val yarnConf = newYarnConfig() + yarnConf.set("yarn.nodemanager.disk-health-checker.max-disk-utilization-per-disk-percentage", + "100.0") + yarnCluster = new MiniYARNCluster(getClass().getName(), 1, 1, 1) - yarnCluster.init(yarnConfig) + yarnCluster.init(yarnConf) yarnCluster.start() // There's a race in MiniYARNCluster in which start() may return before the RM has updated @@ -114,19 +123,23 @@ abstract class BaseYarnClusterSuite sparkArgs: Seq[String] = Nil, extraClassPath: Seq[String] = Nil, extraJars: Seq[String] = Nil, - extraConf: Map[String, String] = Map()): Unit = { + extraConf: Map[String, String] = Map(), + extraEnv: Map[String, String] = Map()): Unit = { val master = if (clientMode) "yarn-client" else "yarn-cluster" val props = new Properties() props.setProperty("spark.yarn.jar", "local:" + fakeSparkJar.getAbsolutePath()) - val childClasspath = logConfDir.getAbsolutePath() + - File.pathSeparator + - sys.props("java.class.path") + - File.pathSeparator + - extraClassPath.mkString(File.pathSeparator) - props.setProperty("spark.driver.extraClassPath", childClasspath) - props.setProperty("spark.executor.extraClassPath", childClasspath) + val testClasspath = new TestClasspathBuilder() + .buildClassPath( + logConfDir.getAbsolutePath() + + File.pathSeparator + + extraClassPath.mkString(File.pathSeparator)) + .asScala + .mkString(File.pathSeparator) + + props.setProperty("spark.driver.extraClassPath", testClasspath) + props.setProperty("spark.executor.extraClassPath", testClasspath) // SPARK-4267: make sure java options are propagated correctly. props.setProperty("spark.driver.extraJavaOptions", "-Dfoo=\"one two three\"") @@ -168,7 +181,7 @@ abstract class BaseYarnClusterSuite appArgs Utils.executeAndGetOutput(argv, - extraEnvironment = Map("YARN_CONF_DIR" -> hadoopConfDir.getAbsolutePath())) + extraEnvironment = Map("YARN_CONF_DIR" -> hadoopConfDir.getAbsolutePath()) ++ extraEnv) } /** diff --git a/yarn/src/test/scala/org/apache/spark/deploy/yarn/YarnClusterSuite.scala b/yarn/src/test/scala/org/apache/spark/deploy/yarn/YarnClusterSuite.scala index 5a4ea2ea2f4ff..b5a42fd6afd98 100644 --- a/yarn/src/test/scala/org/apache/spark/deploy/yarn/YarnClusterSuite.scala +++ b/yarn/src/test/scala/org/apache/spark/deploy/yarn/YarnClusterSuite.scala @@ -28,7 +28,9 @@ import org.apache.hadoop.yarn.conf.YarnConfiguration import org.scalatest.Matchers import org.apache.spark._ -import org.apache.spark.scheduler.{SparkListener, SparkListenerApplicationStart, SparkListenerExecutorAdded} +import org.apache.spark.launcher.TestClasspathBuilder +import org.apache.spark.scheduler.{SparkListener, SparkListenerApplicationStart, + SparkListenerExecutorAdded} import org.apache.spark.scheduler.cluster.ExecutorInfo import org.apache.spark.util.Utils @@ -39,7 +41,7 @@ import org.apache.spark.util.Utils */ class YarnClusterSuite extends BaseYarnClusterSuite { - override def yarnConfig: YarnConfiguration = new YarnConfiguration() + override def newYarnConfig(): YarnConfiguration = new YarnConfiguration() private val TEST_PYFILE = """ |import mod1, mod2 @@ -111,6 +113,17 @@ class YarnClusterSuite extends BaseYarnClusterSuite { val primaryPyFile = new File(tempDir, "test.py") Files.write(TEST_PYFILE, primaryPyFile, UTF_8) + // When running tests, let's not assume the user has built the assembly module, which also + // creates the pyspark archive. Instead, let's use PYSPARK_ARCHIVES_PATH to point at the + // needed locations. 
+ val sparkHome = sys.props("spark.test.home"); + val pythonPath = Seq( + s"$sparkHome/python/lib/py4j-0.8.2.1-src.zip", + s"$sparkHome/python") + val extraEnv = Map( + "PYSPARK_ARCHIVES_PATH" -> pythonPath.map("local:" + _).mkString(File.pathSeparator), + "PYTHONPATH" -> pythonPath.mkString(File.pathSeparator)) + val moduleDir = if (clientMode) { // In client-mode, .py files added with --py-files are not visible in the driver. @@ -130,7 +143,8 @@ class YarnClusterSuite extends BaseYarnClusterSuite { runSpark(clientMode, primaryPyFile.getAbsolutePath(), sparkArgs = Seq("--py-files", pyFiles), - appArgs = Seq(result.getAbsolutePath())) + appArgs = Seq(result.getAbsolutePath()), + extraEnv = extraEnv) checkResult(result) } diff --git a/yarn/src/test/scala/org/apache/spark/deploy/yarn/YarnShuffleIntegrationSuite.scala b/yarn/src/test/scala/org/apache/spark/deploy/yarn/YarnShuffleIntegrationSuite.scala index 5e8238822b90a..8d9c9b3004eda 100644 --- a/yarn/src/test/scala/org/apache/spark/deploy/yarn/YarnShuffleIntegrationSuite.scala +++ b/yarn/src/test/scala/org/apache/spark/deploy/yarn/YarnShuffleIntegrationSuite.scala @@ -34,7 +34,7 @@ import org.apache.spark.network.yarn.{YarnShuffleService, YarnTestAccessor} */ class YarnShuffleIntegrationSuite extends BaseYarnClusterSuite { - override def yarnConfig: YarnConfiguration = { + override def newYarnConfig(): YarnConfiguration = { val yarnConfig = new YarnConfiguration() yarnConfig.set(YarnConfiguration.NM_AUX_SERVICES, "spark_shuffle") yarnConfig.set(YarnConfiguration.NM_AUX_SERVICE_FMT.format("spark_shuffle"), diff --git a/yarn/src/test/scala/org/apache/spark/launcher/TestClasspathBuilder.scala b/yarn/src/test/scala/org/apache/spark/launcher/TestClasspathBuilder.scala new file mode 100644 index 0000000000000..da9e8e21a26ae --- /dev/null +++ b/yarn/src/test/scala/org/apache/spark/launcher/TestClasspathBuilder.scala @@ -0,0 +1,36 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.spark.launcher + +import java.util.{List => JList, Map => JMap} + +/** + * Exposes AbstractCommandBuilder to the YARN tests, so that they can build classpaths the same + * way other cluster managers do. + */ +private[spark] class TestClasspathBuilder extends AbstractCommandBuilder { + + childEnv.put(CommandBuilderUtils.ENV_SPARK_HOME, sys.props("spark.test.home")) + + override def buildClassPath(extraCp: String): JList[String] = super.buildClassPath(extraCp) + + /** Not used by the YARN tests. 
*/ + override def buildCommand(env: JMap[String, String]): JList[String] = + throw new UnsupportedOperationException() + +} From 45723214e694b9a440723e9504c562e6393709f3 Mon Sep 17 00:00:00 2001 From: Shuo Xiang Date: Fri, 28 Aug 2015 13:09:13 -0700 Subject: [PATCH 132/802] [SPARK-10336][example] fix not being able to set intercept in LR example `fitIntercept` is a command line option but not set in the main program. dbtsai Author: Shuo Xiang Closes #8510 from coderxiang/intercept and squashes the following commits: 57c9b7d [Shuo Xiang] fix not being able to set intercept in LR example --- .../org/apache/spark/examples/ml/LogisticRegressionExample.scala | 1 + 1 file changed, 1 insertion(+) diff --git a/examples/src/main/scala/org/apache/spark/examples/ml/LogisticRegressionExample.scala b/examples/src/main/scala/org/apache/spark/examples/ml/LogisticRegressionExample.scala index 7682557127b51..8e3760ddb50a9 100644 --- a/examples/src/main/scala/org/apache/spark/examples/ml/LogisticRegressionExample.scala +++ b/examples/src/main/scala/org/apache/spark/examples/ml/LogisticRegressionExample.scala @@ -136,6 +136,7 @@ object LogisticRegressionExample { .setElasticNetParam(params.elasticNetParam) .setMaxIter(params.maxIter) .setTol(params.tol) + .setFitIntercept(params.fitIntercept) stages += lor val pipeline = new Pipeline().setStages(stages.toArray) From 88032ecaf0455886aed7a66b30af80dae7f6cff7 Mon Sep 17 00:00:00 2001 From: Xiangrui Meng Date: Fri, 28 Aug 2015 13:53:31 -0700 Subject: [PATCH 133/802] [SPARK-9671] [MLLIB] re-org user guide and add migration guide This PR updates the MLlib user guide and adds migration guide for 1.4->1.5. * merge migration guide for `spark.mllib` and `spark.ml` packages * remove dependency section from `spark.ml` guide * move the paragraph about `spark.mllib` and `spark.ml` to the top and recommend `spark.ml` * move Sam's talk to footnote to make the section focus on dependencies Minor changes to code examples and other wording will be in a separate PR. jkbradley srowen feynmanliang Author: Xiangrui Meng Closes #8498 from mengxr/SPARK-9671. --- docs/ml-guide.md | 52 ++------------ docs/mllib-guide.md | 119 ++++++++++++++++----------------- docs/mllib-migration-guides.md | 30 +++++++++ 3 files changed, 95 insertions(+), 106 deletions(-) diff --git a/docs/ml-guide.md b/docs/ml-guide.md index 01bf5ee18e328..ce53400b6ee56 100644 --- a/docs/ml-guide.md +++ b/docs/ml-guide.md @@ -21,19 +21,11 @@ title: Spark ML Programming Guide \]` -Spark 1.2 introduced a new package called `spark.ml`, which aims to provide a uniform set of -high-level APIs that help users create and tune practical machine learning pipelines. - -*Graduated from Alpha!* The Pipelines API is no longer an alpha component, although many elements of it are still `Experimental` or `DeveloperApi`. - -Note that we will keep supporting and adding features to `spark.mllib` along with the -development of `spark.ml`. -Users should be comfortable using `spark.mllib` features and expect more features coming. -Developers should contribute new algorithms to `spark.mllib` and can optionally contribute -to `spark.ml`. - -See the [Algorithm Guides section](#algorithm-guides) below for guides on sub-packages of `spark.ml`, including feature transformers unique to the Pipelines API, ensembles, and more. - +The `spark.ml` package aims to provide a uniform set of high-level APIs built on top of +[DataFrames](sql-programming-guide.html#dataframes) that help users create and tune practical +machine learning pipelines. 
+See the [Algorithm Guides section](#algorithm-guides) below for guides on sub-packages of +`spark.ml`, including feature transformers unique to the Pipelines API, ensembles, and more. **Table of Contents** @@ -171,7 +163,7 @@ This is useful if there are two algorithms with the `maxIter` parameter in a `Pi # Algorithm Guides -There are now several algorithms in the Pipelines API which are not in the lower-level MLlib API, so we link to documentation for them here. These algorithms are mostly feature transformers, which fit naturally into the `Transformer` abstraction in Pipelines, and ensembles, which fit naturally into the `Estimator` abstraction in the Pipelines. +There are now several algorithms in the Pipelines API which are not in the `spark.mllib` API, so we link to documentation for them here. These algorithms are mostly feature transformers, which fit naturally into the `Transformer` abstraction in Pipelines, and ensembles, which fit naturally into the `Estimator` abstraction in the Pipelines. **Pipelines API Algorithm Guides** @@ -880,35 +872,3 @@ jsc.stop(); - -# Dependencies - -Spark ML currently depends on MLlib and has the same dependencies. -Please see the [MLlib Dependencies guide](mllib-guide.html#dependencies) for more info. - -Spark ML also depends upon Spark SQL, but the relevant parts of Spark SQL do not bring additional dependencies. - -# Migration Guide - -## From 1.3 to 1.4 - -Several major API changes occurred, including: -* `Param` and other APIs for specifying parameters -* `uid` unique IDs for Pipeline components -* Reorganization of certain classes -Since the `spark.ml` API was an Alpha Component in Spark 1.3, we do not list all changes here. - -However, now that `spark.ml` is no longer an Alpha Component, we will provide details on any API changes for future releases. - -## From 1.2 to 1.3 - -The main API changes are from Spark SQL. We list the most important changes here: - -* The old [SchemaRDD](http://spark.apache.org/docs/1.2.1/api/scala/index.html#org.apache.spark.sql.SchemaRDD) has been replaced with [DataFrame](api/scala/index.html#org.apache.spark.sql.DataFrame) with a somewhat modified API. All algorithms in Spark ML which used to use SchemaRDD now use DataFrame. -* In Spark 1.2, we used implicit conversions from `RDD`s of `LabeledPoint` into `SchemaRDD`s by calling `import sqlContext._` where `sqlContext` was an instance of `SQLContext`. These implicits have been moved, so we now call `import sqlContext.implicits._`. -* Java APIs for SQL have also changed accordingly. Please see the examples above and the [Spark SQL Programming Guide](sql-programming-guide.html) for details. - -Other changes were in `LogisticRegression`: - -* The `scoreCol` output column (with default value "score") was renamed to be `probabilityCol` (with default value "probability"). The type was originally `Double` (for the probability of class 1.0), but it is now `Vector` (for the probability of each class, to support multiclass classification in the future). -* In Spark 1.2, `LogisticRegressionModel` did not include an intercept. In Spark 1.3, it includes an intercept; however, it will always be 0.0 since it uses the default settings for [spark.mllib.LogisticRegressionWithLBFGS](api/scala/index.html#org.apache.spark.mllib.classification.LogisticRegressionWithLBFGS). The option to use an intercept will be added in the future. 
diff --git a/docs/mllib-guide.md b/docs/mllib-guide.md index 6330c977552d1..876dcfd40ed7b 100644 --- a/docs/mllib-guide.md +++ b/docs/mllib-guide.md @@ -5,21 +5,28 @@ displayTitle: Machine Learning Library (MLlib) Guide description: MLlib machine learning library overview for Spark SPARK_VERSION_SHORT --- -MLlib is Spark's scalable machine learning library consisting of common learning algorithms and utilities, -including classification, regression, clustering, collaborative -filtering, dimensionality reduction, as well as underlying optimization primitives. -Guides for individual algorithms are listed below. +MLlib is Spark's machine learning (ML) library. +Its goal is to make practical machine learning scalable and easy. +It consists of common learning algorithms and utilities, including classification, regression, +clustering, collaborative filtering, dimensionality reduction, as well as lower-level optimization +primitives and higher-level pipeline APIs. -The API is divided into 2 parts: +It divides into two packages: -* [The original `spark.mllib` API](mllib-guide.html#mllib-types-algorithms-and-utilities) is the primary API. -* [The "Pipelines" `spark.ml` API](mllib-guide.html#sparkml-high-level-apis-for-ml-pipelines) is a higher-level API for constructing ML workflows. +* [`spark.mllib`](mllib-guide.html#mllib-types-algorithms-and-utilities) contains the original API + built on top of RDDs. +* [`spark.ml`](mllib-guide.html#sparkml-high-level-apis-for-ml-pipelines) provides higher-level API + built on top of DataFrames for constructing ML pipelines. -We list major functionality from both below, with links to detailed guides. +Using `spark.ml` is recommended because with DataFrames the API is more versatile and flexible. +But we will keep supporting `spark.mllib` along with the development of `spark.ml`. +Users should be comfortable using `spark.mllib` features and expect more features coming. +Developers should contribute new algorithms to `spark.ml` if they fit the ML pipeline concept well, +e.g., feature extractors and transformers. -# MLlib types, algorithms and utilities +We list major functionality from both below, with links to detailed guides. -This lists functionality included in `spark.mllib`, the main MLlib API. +# spark.mllib: data types, algorithms, and utilities * [Data types](mllib-data-types.html) * [Basic statistics](mllib-statistics.html) @@ -56,71 +63,63 @@ This lists functionality included in `spark.mllib`, the main MLlib API. * [limited-memory BFGS (L-BFGS)](mllib-optimization.html#limited-memory-bfgs-l-bfgs) * [PMML model export](mllib-pmml-model-export.html) -MLlib is under active development. -The APIs marked `Experimental`/`DeveloperApi` may change in future releases, -and the migration guide below will explain all changes between releases. - # spark.ml: high-level APIs for ML pipelines -Spark 1.2 introduced a new package called `spark.ml`, which aims to provide a uniform set of -high-level APIs that help users create and tune practical machine learning pipelines. - -*Graduated from Alpha!* The Pipelines API is no longer an alpha component, although many elements of it are still `Experimental` or `DeveloperApi`. - -Note that we will keep supporting and adding features to `spark.mllib` along with the -development of `spark.ml`. -Users should be comfortable using `spark.mllib` features and expect more features coming. -Developers should contribute new algorithms to `spark.mllib` and can optionally contribute -to `spark.ml`. 
- -Guides for `spark.ml` include: +**[spark.ml programming guide](ml-guide.html)** provides an overview of the Pipelines API and major +concepts. It also contains sections on using algorithms within the Pipelines API, for example: -* **[spark.ml programming guide](ml-guide.html)**: overview of the Pipelines API and major concepts -* Guides on using algorithms within the Pipelines API: - * [Feature transformers](ml-features.html), including a few not in the lower-level `spark.mllib` API - * [Decision trees](ml-decision-tree.html) - * [Ensembles](ml-ensembles.html) - * [Linear methods](ml-linear-methods.html) +* [Feature Extraction, Transformation, and Selection](ml-features.html) +* [Decision Trees for Classification and Regression](ml-decision-tree.html) +* [Ensembles](ml-ensembles.html) +* [Linear methods with elastic net regularization](ml-linear-methods.html) +* [Multilayer perceptron classifier](ml-ann.html) # Dependencies -MLlib uses the linear algebra package -[Breeze](http://www.scalanlp.org/), which depends on -[netlib-java](https://github.com/fommil/netlib-java) for optimised -numerical processing. If natives are not available at runtime, you -will see a warning message and a pure JVM implementation will be used -instead. +MLlib uses the linear algebra package [Breeze](http://www.scalanlp.org/), which depends on +[netlib-java](https://github.com/fommil/netlib-java) for optimised numerical processing. +If natives libraries[^1] are not available at runtime, you will see a warning message and a pure JVM +implementation will be used instead. -To learn more about the benefits and background of system optimised -natives, you may wish to watch Sam Halliday's ScalaX talk on -[High Performance Linear Algebra in Scala](http://fommil.github.io/scalax14/#/)). +Due to licensing issues with runtime proprietary binaries, we do not include `netlib-java`'s native +proxies by default. +To configure `netlib-java` / Breeze to use system optimised binaries, include +`com.github.fommil.netlib:all:1.1.2` (or build Spark with `-Pnetlib-lgpl`) as a dependency of your +project and read the [netlib-java](https://github.com/fommil/netlib-java) documentation for your +platform's additional installation instructions. -Due to licensing issues with runtime proprietary binaries, we do not -include `netlib-java`'s native proxies by default. To configure -`netlib-java` / Breeze to use system optimised binaries, include -`com.github.fommil.netlib:all:1.1.2` (or build Spark with -`-Pnetlib-lgpl`) as a dependency of your project and read the -[netlib-java](https://github.com/fommil/netlib-java) documentation for -your platform's additional installation instructions. +To use MLlib in Python, you will need [NumPy](http://www.numpy.org) version 1.4 or newer. -To use MLlib in Python, you will need [NumPy](http://www.numpy.org) -version 1.4 or newer. +[^1]: To learn more about the benefits and background of system optimised natives, you may wish to + watch Sam Halliday's ScalaX talk on [High Performance Linear Algebra in Scala](http://fommil.github.io/scalax14/#/). ---- +# Migration guide -# Migration Guide +MLlib is under active development. +The APIs marked `Experimental`/`DeveloperApi` may change in future releases, +and the migration guide below will explain all changes between releases. + +## From 1.4 to 1.5 -For the `spark.ml` package, please see the [spark.ml Migration Guide](ml-guide.html#migration-guide). 
+In the `spark.mllib` package, there are no break API changes but several behavior changes: -## From 1.3 to 1.4 +* [SPARK-9005](https://issues.apache.org/jira/browse/SPARK-9005): + `RegressionMetrics.explainedVariance` returns the average regression sum of squares. +* [SPARK-8600](https://issues.apache.org/jira/browse/SPARK-8600): `NaiveBayesModel.labels` become + sorted. +* [SPARK-3382](https://issues.apache.org/jira/browse/SPARK-3382): `GradientDescent` has a default + convergence tolerance `1e-3`, and hence iterations might end earlier than 1.4. -In the `spark.mllib` package, there were several breaking changes, but all in `DeveloperApi` or `Experimental` APIs: +In the `spark.ml` package, there exists one break API change and one behavior change: -* Gradient-Boosted Trees - * *(Breaking change)* The signature of the [`Loss.gradient`](api/scala/index.html#org.apache.spark.mllib.tree.loss.Loss) method was changed. This is only an issues for users who wrote their own losses for GBTs. - * *(Breaking change)* The `apply` and `copy` methods for the case class [`BoostingStrategy`](api/scala/index.html#org.apache.spark.mllib.tree.configuration.BoostingStrategy) have been changed because of a modification to the case class fields. This could be an issue for users who use `BoostingStrategy` to set GBT parameters. -* *(Breaking change)* The return value of [`LDA.run`](api/scala/index.html#org.apache.spark.mllib.clustering.LDA) has changed. It now returns an abstract class `LDAModel` instead of the concrete class `DistributedLDAModel`. The object of type `LDAModel` can still be cast to the appropriate concrete type, which depends on the optimization algorithm. +* [SPARK-9268](https://issues.apache.org/jira/browse/SPARK-9268): Java's varargs support is removed + from `Params.setDefault` due to a + [Scala compiler bug](https://issues.scala-lang.org/browse/SI-9013). +* [SPARK-10097](https://issues.apache.org/jira/browse/SPARK-10097): `Evaluator.isLargerBetter` is + added to indicate metric ordering. Metrics like RMSE no longer flip signs as in 1.4. -## Previous Spark Versions +## Previous Spark versions Earlier migration guides are archived [on this page](mllib-migration-guides.html). + +--- diff --git a/docs/mllib-migration-guides.md b/docs/mllib-migration-guides.md index 8df68d81f3c78..774b85d1f773a 100644 --- a/docs/mllib-migration-guides.md +++ b/docs/mllib-migration-guides.md @@ -7,6 +7,25 @@ description: MLlib migration guides from before Spark SPARK_VERSION_SHORT The migration guide for the current Spark version is kept on the [MLlib Programming Guide main page](mllib-guide.html#migration-guide). +## From 1.3 to 1.4 + +In the `spark.mllib` package, there were several breaking changes, but all in `DeveloperApi` or `Experimental` APIs: + +* Gradient-Boosted Trees + * *(Breaking change)* The signature of the [`Loss.gradient`](api/scala/index.html#org.apache.spark.mllib.tree.loss.Loss) method was changed. This is only an issues for users who wrote their own losses for GBTs. + * *(Breaking change)* The `apply` and `copy` methods for the case class [`BoostingStrategy`](api/scala/index.html#org.apache.spark.mllib.tree.configuration.BoostingStrategy) have been changed because of a modification to the case class fields. This could be an issue for users who use `BoostingStrategy` to set GBT parameters. +* *(Breaking change)* The return value of [`LDA.run`](api/scala/index.html#org.apache.spark.mllib.clustering.LDA) has changed. 
It now returns an abstract class `LDAModel` instead of the concrete class `DistributedLDAModel`. The object of type `LDAModel` can still be cast to the appropriate concrete type, which depends on the optimization algorithm. + +In the `spark.ml` package, several major API changes occurred, including: + +* `Param` and other APIs for specifying parameters +* `uid` unique IDs for Pipeline components +* Reorganization of certain classes + +Since the `spark.ml` API was an alpha component in Spark 1.3, we do not list all changes here. +However, since 1.4 `spark.ml` is no longer an alpha component, we will provide details on any API +changes for future releases. + ## From 1.2 to 1.3 In the `spark.mllib` package, there were several breaking changes. The first change (in `ALS`) is the only one in a component not marked as Alpha or Experimental. @@ -23,6 +42,17 @@ In the `spark.mllib` package, there were several breaking changes. The first ch * In linear regression (including Lasso and ridge regression), the squared loss is now divided by 2. So in order to produce the same result as in 1.2, the regularization parameter needs to be divided by 2 and the step size needs to be multiplied by 2. +In the `spark.ml` package, the main API changes are from Spark SQL. We list the most important changes here: + +* The old [SchemaRDD](http://spark.apache.org/docs/1.2.1/api/scala/index.html#org.apache.spark.sql.SchemaRDD) has been replaced with [DataFrame](api/scala/index.html#org.apache.spark.sql.DataFrame) with a somewhat modified API. All algorithms in Spark ML which used to use SchemaRDD now use DataFrame. +* In Spark 1.2, we used implicit conversions from `RDD`s of `LabeledPoint` into `SchemaRDD`s by calling `import sqlContext._` where `sqlContext` was an instance of `SQLContext`. These implicits have been moved, so we now call `import sqlContext.implicits._`. +* Java APIs for SQL have also changed accordingly. Please see the examples above and the [Spark SQL Programming Guide](sql-programming-guide.html) for details. + +Other changes were in `LogisticRegression`: + +* The `scoreCol` output column (with default value "score") was renamed to be `probabilityCol` (with default value "probability"). The type was originally `Double` (for the probability of class 1.0), but it is now `Vector` (for the probability of each class, to support multiclass classification in the future). +* In Spark 1.2, `LogisticRegressionModel` did not include an intercept. In Spark 1.3, it includes an intercept; however, it will always be 0.0 since it uses the default settings for [spark.mllib.LogisticRegressionWithLBFGS](api/scala/index.html#org.apache.spark.mllib.classification.LogisticRegressionWithLBFGS). The option to use an intercept will be added in the future. + ## From 1.1 to 1.2 The only API changes in MLlib v1.2 are in From bb7f35239385ec74b5ee69631b5480fbcee253e4 Mon Sep 17 00:00:00 2001 From: Davies Liu Date: Fri, 28 Aug 2015 14:38:20 -0700 Subject: [PATCH 134/802] [SPARK-10323] [SQL] fix nullability of In/InSet/ArrayContain After this PR, In/InSet/ArrayContain will return null if value is null, instead of false. They also will return null even if there is a null in the set/array. Author: Davies Liu Closes #8492 from davies/fix_in. 
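For readers skimming the diff below, here is a minimal sketch of the new semantics (not part of the patch; the object name and the standalone `main` are illustrative), evaluated through the Catalyst expression API that the updated test suites exercise:

```scala
// Sketch only: spells out the three-valued logic described above for In.
// InSet and ArrayContains follow the same rule: a definite match wins;
// otherwise a null value or a null element makes the result null, not false.
import org.apache.spark.sql.catalyst.expressions.{In, Literal}
import org.apache.spark.sql.types.IntegerType

object InNullSemanticsSketch {
  def main(args: Array[String]): Unit = {
    val nullInt = Literal.create(null, IntegerType)

    // 1 IN (1, NULL) -> true: the match is decisive despite the null element
    println(In(Literal(1), Seq(Literal(1), nullInt)).eval())
    // 2 IN (1, NULL) -> null: no match, and the null element leaves the answer unknown
    println(In(Literal(2), Seq(Literal(1), nullInt)).eval())
    // NULL IN (1, 2) -> null: a null value no longer evaluates to false
    println(In(nullInt, Seq(Literal(1), Literal(2))).eval())
  }
}
```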
--- .../expressions/collectionOperations.scala | 62 ++++++++-------- .../sql/catalyst/expressions/predicates.scala | 71 +++++++++++++++---- .../sql/catalyst/optimizer/Optimizer.scala | 6 -- .../CollectionFunctionsSuite.scala | 12 +++- .../catalyst/expressions/PredicateSuite.scala | 49 +++++++++---- .../optimizer/ConstantFoldingSuite.scala | 21 +----- .../spark/sql/DataFrameFunctionsSuite.scala | 14 ++-- 7 files changed, 138 insertions(+), 97 deletions(-) diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/collectionOperations.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/collectionOperations.scala index 646afa4047d84..7b8c5b723ded4 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/collectionOperations.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/collectionOperations.scala @@ -19,9 +19,7 @@ package org.apache.spark.sql.catalyst.expressions import java.util.Comparator import org.apache.spark.sql.catalyst.analysis.TypeCheckResult -import org.apache.spark.sql.catalyst.expressions.codegen.{ - CodegenFallback, CodeGenContext, GeneratedExpressionCode} -import org.apache.spark.sql.catalyst.InternalRow +import org.apache.spark.sql.catalyst.expressions.codegen.{CodeGenContext, CodegenFallback, GeneratedExpressionCode} import org.apache.spark.sql.types._ /** @@ -145,46 +143,42 @@ case class ArrayContains(left: Expression, right: Expression) } } - override def nullable: Boolean = false + override def nullable: Boolean = { + left.nullable || right.nullable || left.dataType.asInstanceOf[ArrayType].containsNull + } - override def eval(input: InternalRow): Boolean = { - val arr = left.eval(input) - if (arr == null) { - false - } else { - val value = right.eval(input) - if (value == null) { - false - } else { - arr.asInstanceOf[ArrayData].foreach(right.dataType, (i, v) => - if (v == value) return true - ) - false + override def nullSafeEval(arr: Any, value: Any): Any = { + var hasNull = false + arr.asInstanceOf[ArrayData].foreach(right.dataType, (i, v) => + if (v == null) { + hasNull = true + } else if (v == value) { + return true } + ) + if (hasNull) { + null + } else { + false } } override def genCode(ctx: CodeGenContext, ev: GeneratedExpressionCode): String = { - val arrGen = left.gen(ctx) - val elementGen = right.gen(ctx) - val i = ctx.freshName("i") - val getValue = ctx.getValue(arrGen.primitive, right.dataType, i) - s""" - ${arrGen.code} - boolean ${ev.isNull} = false; - boolean ${ev.primitive} = false; - if (!${arrGen.isNull}) { - ${elementGen.code} - if (!${elementGen.isNull}) { - for (int $i = 0; $i < ${arrGen.primitive}.numElements(); $i ++) { - if (${ctx.genEqual(right.dataType, elementGen.primitive, getValue)}) { - ${ev.primitive} = true; - break; - } - } + nullSafeCodeGen(ctx, ev, (arr, value) => { + val i = ctx.freshName("i") + val getValue = ctx.getValue(arr, right.dataType, i) + s""" + for (int $i = 0; $i < $arr.numElements(); $i ++) { + if ($arr.isNullAt($i)) { + ${ev.isNull} = true; + } else if (${ctx.genEqual(right.dataType, value, getValue)}) { + ${ev.isNull} = false; + ${ev.primitive} = true; + break; } } """ + }) } override def prettyName: String = "array_contains" diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/predicates.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/predicates.scala index fe7dffb815987..65706dba7d975 100644 --- 
a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/predicates.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/predicates.scala @@ -17,11 +17,9 @@ package org.apache.spark.sql.catalyst.expressions -import scala.collection.mutable - -import org.apache.spark.sql.catalyst.analysis.TypeCheckResult -import org.apache.spark.sql.catalyst.expressions.codegen.{CodegenFallback, GeneratedExpressionCode, CodeGenContext} import org.apache.spark.sql.catalyst.InternalRow +import org.apache.spark.sql.catalyst.analysis.TypeCheckResult +import org.apache.spark.sql.catalyst.expressions.codegen.{CodeGenContext, GeneratedExpressionCode} import org.apache.spark.sql.catalyst.plans.logical.LogicalPlan import org.apache.spark.sql.catalyst.util.TypeUtils import org.apache.spark.sql.types._ @@ -103,6 +101,8 @@ case class Not(child: Expression) case class In(value: Expression, list: Seq[Expression]) extends Predicate with ImplicitCastInputTypes { + require(list != null, "list should not be null") + override def inputTypes: Seq[AbstractDataType] = value.dataType +: list.map(_.dataType) override def checkInputDataTypes(): TypeCheckResult = { @@ -116,12 +116,31 @@ case class In(value: Expression, list: Seq[Expression]) extends Predicate override def children: Seq[Expression] = value +: list - override def nullable: Boolean = false // TODO: Figure out correct nullability semantics of IN. + override def nullable: Boolean = children.exists(_.nullable) + override def foldable: Boolean = children.forall(_.foldable) + override def toString: String = s"$value IN ${list.mkString("(", ",", ")")}" override def eval(input: InternalRow): Any = { val evaluatedValue = value.eval(input) - list.exists(e => e.eval(input) == evaluatedValue) + if (evaluatedValue == null) { + null + } else { + var hasNull = false + list.foreach { e => + val v = e.eval(input) + if (v == evaluatedValue) { + return true + } else if (v == null) { + hasNull = true + } + } + if (hasNull) { + null + } else { + false + } + } } override def genCode(ctx: CodeGenContext, ev: GeneratedExpressionCode): String = { @@ -131,7 +150,10 @@ case class In(value: Expression, list: Seq[Expression]) extends Predicate s""" if (!${ev.primitive}) { ${x.code} - if (${ctx.genEqual(value.dataType, valueGen.primitive, x.primitive)}) { + if (${x.isNull}) { + ${ev.isNull} = true; + } else if (${ctx.genEqual(value.dataType, valueGen.primitive, x.primitive)}) { + ${ev.isNull} = false; ${ev.primitive} = true; } } @@ -139,8 +161,10 @@ case class In(value: Expression, list: Seq[Expression]) extends Predicate s""" ${valueGen.code} boolean ${ev.primitive} = false; - boolean ${ev.isNull} = false; - $listCode + boolean ${ev.isNull} = ${valueGen.isNull}; + if (!${ev.isNull}) { + $listCode + } """ } } @@ -151,11 +175,22 @@ case class In(value: Expression, list: Seq[Expression]) extends Predicate */ case class InSet(child: Expression, hset: Set[Any]) extends UnaryExpression with Predicate { - override def nullable: Boolean = false // TODO: Figure out correct nullability semantics of IN. 
+ require(hset != null, "hset could not be null") + override def toString: String = s"$child INSET ${hset.mkString("(", ",", ")")}" - override def eval(input: InternalRow): Any = { - hset.contains(child.eval(input)) + @transient private[this] lazy val hasNull: Boolean = hset.contains(null) + + override def nullable: Boolean = child.nullable || hasNull + + protected override def nullSafeEval(value: Any): Any = { + if (hset.contains(value)) { + true + } else if (hasNull) { + null + } else { + false + } } def getHSet(): Set[Any] = hset @@ -166,12 +201,20 @@ case class InSet(child: Expression, hset: Set[Any]) extends UnaryExpression with val childGen = child.gen(ctx) ctx.references += this val hsetTerm = ctx.freshName("hset") + val hasNullTerm = ctx.freshName("hasNull") ctx.addMutableState(setName, hsetTerm, s"$hsetTerm = (($InSetName)expressions[${ctx.references.size - 1}]).getHSet();") + ctx.addMutableState("boolean", hasNullTerm, s"$hasNullTerm = $hsetTerm.contains(null);") s""" ${childGen.code} - boolean ${ev.isNull} = false; - boolean ${ev.primitive} = $hsetTerm.contains(${childGen.primitive}); + boolean ${ev.isNull} = ${childGen.isNull}; + boolean ${ev.primitive} = false; + if (!${ev.isNull}) { + ${ev.primitive} = $hsetTerm.contains(${childGen.primitive}); + if (!${ev.primitive} && $hasNullTerm) { + ${ev.isNull} = true; + } + } """ } } diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/optimizer/Optimizer.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/optimizer/Optimizer.scala index 854463dd11c74..a430000bef653 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/optimizer/Optimizer.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/optimizer/Optimizer.scala @@ -395,12 +395,6 @@ object ConstantFolding extends Rule[LogicalPlan] { // Fold expressions that are foldable. case e if e.foldable => Literal.create(e.eval(EmptyRow), e.dataType) - - // Fold "literal in (item1, item2, ..., literal, ...)" into true directly. 
- case In(Literal(v, _), list) if list.exists { - case Literal(candidate, _) if candidate == v => true - case _ => false - } => Literal.create(true, BooleanType) } } } diff --git a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/CollectionFunctionsSuite.scala b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/CollectionFunctionsSuite.scala index 95f0e38212a1a..a3e81888dfd0d 100644 --- a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/CollectionFunctionsSuite.scala +++ b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/CollectionFunctionsSuite.scala @@ -70,14 +70,20 @@ class CollectionFunctionsSuite extends SparkFunSuite with ExpressionEvalHelper { val a0 = Literal.create(Seq(1, 2, 3), ArrayType(IntegerType)) val a1 = Literal.create(Seq[String](null, ""), ArrayType(StringType)) val a2 = Literal.create(Seq(null), ArrayType(LongType)) + val a3 = Literal.create(null, ArrayType(StringType)) checkEvaluation(ArrayContains(a0, Literal(1)), true) checkEvaluation(ArrayContains(a0, Literal(0)), false) - checkEvaluation(ArrayContains(a0, Literal(null)), false) + checkEvaluation(ArrayContains(a0, Literal.create(null, IntegerType)), null) checkEvaluation(ArrayContains(a1, Literal("")), true) - checkEvaluation(ArrayContains(a1, Literal(null)), false) + checkEvaluation(ArrayContains(a1, Literal("a")), null) + checkEvaluation(ArrayContains(a1, Literal.create(null, StringType)), null) - checkEvaluation(ArrayContains(a2, Literal(null)), false) + checkEvaluation(ArrayContains(a2, Literal(1L)), null) + checkEvaluation(ArrayContains(a2, Literal.create(null, LongType)), null) + + checkEvaluation(ArrayContains(a3, Literal("")), null) + checkEvaluation(ArrayContains(a3, Literal.create(null, StringType)), null) } } diff --git a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/PredicateSuite.scala b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/PredicateSuite.scala index 54c04faddb477..03e7611fce8ff 100644 --- a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/PredicateSuite.scala +++ b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/PredicateSuite.scala @@ -20,7 +20,6 @@ package org.apache.spark.sql.catalyst.expressions import scala.collection.immutable.HashSet import org.apache.spark.SparkFunSuite -import org.apache.spark.sql.catalyst.dsl.expressions._ import org.apache.spark.sql.RandomDataGenerator import org.apache.spark.sql.types._ @@ -119,6 +118,12 @@ class PredicateSuite extends SparkFunSuite with ExpressionEvalHelper { (null, null, null) :: Nil) test("IN") { + checkEvaluation(In(Literal.create(null, IntegerType), Seq(Literal(1), Literal(2))), null) + checkEvaluation(In(Literal.create(null, IntegerType), Seq(Literal.create(null, IntegerType))), + null) + checkEvaluation(In(Literal(1), Seq(Literal.create(null, IntegerType))), null) + checkEvaluation(In(Literal(1), Seq(Literal(1), Literal.create(null, IntegerType))), true) + checkEvaluation(In(Literal(2), Seq(Literal(1), Literal.create(null, IntegerType))), null) checkEvaluation(In(Literal(1), Seq(Literal(1), Literal(2))), true) checkEvaluation(In(Literal(2), Seq(Literal(1), Literal(2))), true) checkEvaluation(In(Literal(3), Seq(Literal(1), Literal(2))), false) @@ -126,14 +131,18 @@ class PredicateSuite extends SparkFunSuite with ExpressionEvalHelper { And(In(Literal(1), Seq(Literal(1), Literal(2))), In(Literal(2), Seq(Literal(1), Literal(2)))), true) - checkEvaluation(In(Literal("^Ba*n"), 
Seq(Literal("^Ba*n"))), true) + val ns = Literal.create(null, StringType) + checkEvaluation(In(ns, Seq(Literal("1"), Literal("2"))), null) + checkEvaluation(In(ns, Seq(ns)), null) + checkEvaluation(In(Literal("a"), Seq(ns)), null) + checkEvaluation(In(Literal("^Ba*n"), Seq(Literal("^Ba*n"), ns)), true) checkEvaluation(In(Literal("^Ba*n"), Seq(Literal("aa"), Literal("^Ba*n"))), true) checkEvaluation(In(Literal("^Ba*n"), Seq(Literal("aa"), Literal("^n"))), false) val primitiveTypes = Seq(IntegerType, FloatType, DoubleType, StringType, ByteType, ShortType, LongType, BinaryType, BooleanType, DecimalType.USER_DEFAULT, TimestampType) primitiveTypes.map { t => - val dataGen = RandomDataGenerator.forType(t, nullable = false).get + val dataGen = RandomDataGenerator.forType(t, nullable = true).get val inputData = Seq.fill(10) { val value = dataGen.apply() value match { @@ -142,9 +151,17 @@ class PredicateSuite extends SparkFunSuite with ExpressionEvalHelper { case _ => value } } - val input = inputData.map(Literal(_)) - checkEvaluation(In(input(0), input.slice(1, 10)), - inputData.slice(1, 10).contains(inputData(0))) + val input = inputData.map(Literal.create(_, t)) + val expected = if (inputData(0) == null) { + null + } else if (inputData.slice(1, 10).contains(inputData(0))) { + true + } else if (inputData.slice(1, 10).contains(null)) { + null + } else { + false + } + checkEvaluation(In(input(0), input.slice(1, 10)), expected) } } @@ -158,15 +175,15 @@ class PredicateSuite extends SparkFunSuite with ExpressionEvalHelper { checkEvaluation(InSet(one, hS), true) checkEvaluation(InSet(two, hS), true) checkEvaluation(InSet(two, nS), true) - checkEvaluation(InSet(nl, nS), true) checkEvaluation(InSet(three, hS), false) - checkEvaluation(InSet(three, nS), false) - checkEvaluation(And(InSet(one, hS), InSet(two, hS)), true) + checkEvaluation(InSet(three, nS), null) + checkEvaluation(InSet(nl, hS), null) + checkEvaluation(InSet(nl, nS), null) val primitiveTypes = Seq(IntegerType, FloatType, DoubleType, StringType, ByteType, ShortType, LongType, BinaryType, BooleanType, DecimalType.USER_DEFAULT, TimestampType) primitiveTypes.map { t => - val dataGen = RandomDataGenerator.forType(t, nullable = false).get + val dataGen = RandomDataGenerator.forType(t, nullable = true).get val inputData = Seq.fill(10) { val value = dataGen.apply() value match { @@ -176,8 +193,16 @@ class PredicateSuite extends SparkFunSuite with ExpressionEvalHelper { } } val input = inputData.map(Literal(_)) - checkEvaluation(InSet(input(0), inputData.slice(1, 10).toSet), - inputData.slice(1, 10).contains(inputData(0))) + val expected = if (inputData(0) == null) { + null + } else if (inputData.slice(1, 10).contains(inputData(0))) { + true + } else if (inputData.slice(1, 10).contains(null)) { + null + } else { + false + } + checkEvaluation(InSet(input(0), inputData.slice(1, 10).toSet), expected) } } diff --git a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/optimizer/ConstantFoldingSuite.scala b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/optimizer/ConstantFoldingSuite.scala index ec3b2f1edfa05..e67606288f514 100644 --- a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/optimizer/ConstantFoldingSuite.scala +++ b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/optimizer/ConstantFoldingSuite.scala @@ -250,29 +250,14 @@ class ConstantFoldingSuite extends PlanTest { } test("Constant folding test: Fold In(v, list) into true or false") { - var originalQuery = + val originalQuery = testRelation .select('a) 
.where(In(Literal(1), Seq(Literal(1), Literal(2)))) - var optimized = Optimize.execute(originalQuery.analyze) - - var correctAnswer = - testRelation - .select('a) - .where(Literal(true)) - .analyze - - comparePlans(optimized, correctAnswer) - - originalQuery = - testRelation - .select('a) - .where(In(Literal(1), Seq(Literal(1), 'a.attr))) - - optimized = Optimize.execute(originalQuery.analyze) + val optimized = Optimize.execute(originalQuery.analyze) - correctAnswer = + val correctAnswer = testRelation .select('a) .where(Literal(true)) diff --git a/sql/core/src/test/scala/org/apache/spark/sql/DataFrameFunctionsSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/DataFrameFunctionsSuite.scala index 9d965258e389d..3a3f19af1473b 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/DataFrameFunctionsSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/DataFrameFunctionsSuite.scala @@ -366,10 +366,6 @@ class DataFrameFunctionsSuite extends QueryTest with SharedSQLContext { df.selectExpr("array_contains(a, 1)"), Seq(Row(true), Row(false)) ) - checkAnswer( - df.select(array_contains(array(lit(2), lit(null)), 1)), - Seq(Row(false), Row(false)) - ) // In hive, this errors because null has no type information intercept[AnalysisException] { @@ -382,15 +378,13 @@ class DataFrameFunctionsSuite extends QueryTest with SharedSQLContext { df.selectExpr("array_contains(null, 1)") } - // In hive, if either argument has a matching type has a null value, return false, even if - // the first argument array contains a null and the second argument is null checkAnswer( - df.selectExpr("array_contains(array(array(1), null)[1], 1)"), - Seq(Row(false), Row(false)) + df.selectExpr("array_contains(array(array(1), null)[0], 1)"), + Seq(Row(true), Row(true)) ) checkAnswer( - df.selectExpr("array_contains(array(0, null), array(1, null)[1])"), - Seq(Row(false), Row(false)) + df.selectExpr("array_contains(array(1, null), array(1, null)[0])"), + Seq(Row(true), Row(true)) ) } } From 2a4e00ca4d4e7a148b4ff8ce0ad1c6d517cee55f Mon Sep 17 00:00:00 2001 From: felixcheung Date: Fri, 28 Aug 2015 18:35:01 -0700 Subject: [PATCH 135/802] [SPARK-9803] [SPARKR] Add subset and transform + tests Add subset and transform Also reorganize `[` & `[[` to subset instead of select Note: for transform, transform is very similar to mutate. Spark doesn't seem to replace existing column with the name in mutate (ie. `mutate(df, age = df$age + 2)` - returned DataFrame has 2 columns with the same name 'age'), so therefore not doing that for now in transform. Though it is clearly stated it should replace column with matching name (should I open a JIRA for mutate/transform?) Author: felixcheung Closes #8503 from felixcheung/rsubset_transform. 
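Since the new SparkR methods are thin wrappers over existing DataFrame operations, a rough Scala-API analogue may help readers who do not use R. This sketch is not part of the patch; the column names (`name`, `age`) and the object and method names are illustrative only.

```scala
// Rough Scala-API analogue of the new SparkR calls
//   subset(df, df$age %in% c(19, 30), c(1, 2))
//   transform(df, newAge = -df$age)
import org.apache.spark.sql.DataFrame
import org.apache.spark.sql.functions.col

object SparkRSubsetTransformSketch {
  // subset(x, condition, columns): filter the rows, then project the requested columns
  def subsetLike(df: DataFrame): DataFrame =
    df.filter(col("age").isin(19, 30)).select("name", "age")

  // transform(x, newCol = expr): keep every existing column and append the new one,
  // which is why a column with a clashing name is currently duplicated rather than replaced
  def transformLike(df: DataFrame): DataFrame =
    df.select(col("*"), (-col("age")).as("newAge"))
}
```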
--- R/pkg/NAMESPACE | 2 + R/pkg/R/DataFrame.R | 70 +++++++++++++++++++++++++------- R/pkg/R/generics.R | 10 ++++- R/pkg/inst/tests/test_sparkSQL.R | 20 ++++++++- 4 files changed, 85 insertions(+), 17 deletions(-) diff --git a/R/pkg/NAMESPACE b/R/pkg/NAMESPACE index 5286c01986204..9d39630706436 100644 --- a/R/pkg/NAMESPACE +++ b/R/pkg/NAMESPACE @@ -69,9 +69,11 @@ exportMethods("arrange", "selectExpr", "show", "showDF", + "subset", "summarize", "summary", "take", + "transform", "unionAll", "unique", "unpersist", diff --git a/R/pkg/R/DataFrame.R b/R/pkg/R/DataFrame.R index 74de7c81e35a6..8a00238b41d60 100644 --- a/R/pkg/R/DataFrame.R +++ b/R/pkg/R/DataFrame.R @@ -987,7 +987,7 @@ setMethod("$<-", signature(x = "DataFrame"), setClassUnion("numericOrcharacter", c("numeric", "character")) -#' @rdname select +#' @rdname subset #' @name [[ setMethod("[[", signature(x = "DataFrame", i = "numericOrcharacter"), function(x, i) { @@ -998,7 +998,7 @@ setMethod("[[", signature(x = "DataFrame", i = "numericOrcharacter"), getColumn(x, i) }) -#' @rdname select +#' @rdname subset #' @name [ setMethod("[", signature(x = "DataFrame", i = "missing"), function(x, i, j, ...) { @@ -1012,7 +1012,7 @@ setMethod("[", signature(x = "DataFrame", i = "missing"), select(x, j) }) -#' @rdname select +#' @rdname subset #' @name [ setMethod("[", signature(x = "DataFrame", i = "Column"), function(x, i, j, ...) { @@ -1020,12 +1020,43 @@ setMethod("[", signature(x = "DataFrame", i = "Column"), # https://stat.ethz.ch/R-manual/R-devel/library/base/html/Extract.data.frame.html filtered <- filter(x, i) if (!missing(j)) { - filtered[, j] + filtered[, j, ...] } else { filtered } }) +#' Subset +#' +#' Return subsets of DataFrame according to given conditions +#' @param x A DataFrame +#' @param subset A logical expression to filter on rows +#' @param select expression for the single Column or a list of columns to select from the DataFrame +#' @return A new DataFrame containing only the rows that meet the condition with selected columns +#' @export +#' @rdname subset +#' @name subset +#' @aliases [ +#' @family subsetting functions +#' @examples +#' \dontrun{ +#' # Columns can be selected using `[[` and `[` +#' df[[2]] == df[["age"]] +#' df[,2] == df[,"age"] +#' df[,c("name", "age")] +#' # Or to filter rows +#' df[df$age > 20,] +#' # DataFrame can be subset on both rows and Columns +#' df[df$name == "Smith", c(1,2)] +#' df[df$age %in% c(19, 30), 1:2] +#' subset(df, df$age %in% c(19, 30), 1:2) +#' subset(df, df$age %in% c(19), select = c(1,2)) +#' } +setMethod("subset", signature(x = "DataFrame"), + function(x, subset, select, ...) { + x[subset, select, ...] + }) + #' Select #' #' Selects a set of columns with names or Column expressions. 
@@ -1034,6 +1065,8 @@ setMethod("[", signature(x = "DataFrame", i = "Column"), #' @return A new DataFrame with selected columns #' @export #' @rdname select +#' @name select +#' @family subsetting functions #' @examples #' \dontrun{ #' select(df, "*") @@ -1041,15 +1074,8 @@ setMethod("[", signature(x = "DataFrame", i = "Column"), #' select(df, df$name, df$age + 1) #' select(df, c("col1", "col2")) #' select(df, list(df$name, df$age + 1)) -#' # Columns can also be selected using `[[` and `[` -#' df[[2]] == df[["age"]] -#' df[,2] == df[,"age"] -#' df[,c("name", "age")] #' # Similar to R data frames columns can also be selected using `$` #' df$age -#' # It can also be subset on rows and Columns -#' df[df$name == "Smith", c(1,2)] -#' df[df$age %in% c(19, 30), 1:2] #' } setMethod("select", signature(x = "DataFrame", col = "character"), function(x, col, ...) { @@ -1121,7 +1147,7 @@ setMethod("selectExpr", #' @return A DataFrame with the new column added. #' @rdname withColumn #' @name withColumn -#' @aliases mutate +#' @aliases mutate transform #' @export #' @examples #'\dontrun{ @@ -1141,11 +1167,12 @@ setMethod("withColumn", #' #' Return a new DataFrame with the specified columns added. #' -#' @param x A DataFrame +#' @param .data A DataFrame #' @param col a named argument of the form name = col #' @return A new DataFrame with the new columns added. #' @rdname withColumn #' @name mutate +#' @aliases withColumn transform #' @export #' @examples #'\dontrun{ @@ -1155,10 +1182,12 @@ setMethod("withColumn", #' df <- jsonFile(sqlContext, path) #' newDF <- mutate(df, newCol = df$col1 * 5, newCol2 = df$col1 * 2) #' names(newDF) # Will contain newCol, newCol2 +#' newDF2 <- transform(df, newCol = df$col1 / 5, newCol2 = df$col1 * 2) #' } setMethod("mutate", - signature(x = "DataFrame"), - function(x, ...) { + signature(.data = "DataFrame"), + function(.data, ...) { + x <- .data cols <- list(...) stopifnot(length(cols) > 0) stopifnot(class(cols[[1]]) == "Column") @@ -1173,6 +1202,16 @@ setMethod("mutate", do.call(select, c(x, x$"*", cols)) }) +#' @export +#' @rdname withColumn +#' @name transform +#' @aliases withColumn mutate +setMethod("transform", + signature(`_data` = "DataFrame"), + function(`_data`, ...) { + mutate(`_data`, ...) + }) + #' WithColumnRenamed #' #' Rename an existing column in a DataFrame. @@ -1300,6 +1339,7 @@ setMethod("orderBy", #' @return A DataFrame containing only the rows that meet the condition. #' @rdname filter #' @name filter +#' @family subsetting functions #' @export #' @examples #'\dontrun{ diff --git a/R/pkg/R/generics.R b/R/pkg/R/generics.R index b578b8789d2c5..43dd8d283ab6b 100644 --- a/R/pkg/R/generics.R +++ b/R/pkg/R/generics.R @@ -467,7 +467,7 @@ setGeneric("merge") #' @rdname withColumn #' @export -setGeneric("mutate", function(x, ...) {standardGeneric("mutate") }) +setGeneric("mutate", function(.data, ...) {standardGeneric("mutate") }) #' @rdname arrange #' @export @@ -507,6 +507,10 @@ setGeneric("saveAsTable", function(df, tableName, source, mode, ...) { standardGeneric("saveAsTable") }) +#' @rdname withColumn +#' @export +setGeneric("transform", function(`_data`, ...) {standardGeneric("transform") }) + #' @rdname write.df #' @export setGeneric("write.df", function(df, path, ...) { standardGeneric("write.df") }) @@ -531,6 +535,10 @@ setGeneric("selectExpr", function(x, expr, ...) { standardGeneric("selectExpr") #' @export setGeneric("showDF", function(x,...) 
{ standardGeneric("showDF") }) +# @rdname subset +# @export +setGeneric("subset", function(x, subset, select, ...) { standardGeneric("subset") }) + #' @rdname agg #' @export setGeneric("summarize", function(x,...) { standardGeneric("summarize") }) diff --git a/R/pkg/inst/tests/test_sparkSQL.R b/R/pkg/inst/tests/test_sparkSQL.R index 933b11c8ee7e2..0da5e38654732 100644 --- a/R/pkg/inst/tests/test_sparkSQL.R +++ b/R/pkg/inst/tests/test_sparkSQL.R @@ -612,6 +612,10 @@ test_that("subsetting", { df5 <- df[df$age %in% c(19), c(1,2)] expect_equal(count(df5), 1) expect_equal(columns(df5), c("name", "age")) + + df6 <- subset(df, df$age %in% c(30), c(1,2)) + expect_equal(count(df6), 1) + expect_equal(columns(df6), c("name", "age")) }) test_that("selectExpr() on a DataFrame", { @@ -1028,7 +1032,7 @@ test_that("withColumn() and withColumnRenamed()", { expect_equal(columns(newDF2)[1], "newerAge") }) -test_that("mutate(), rename() and names()", { +test_that("mutate(), transform(), rename() and names()", { df <- jsonFile(sqlContext, jsonPath) newDF <- mutate(df, newAge = df$age + 2) expect_equal(length(columns(newDF)), 3) @@ -1042,6 +1046,20 @@ test_that("mutate(), rename() and names()", { names(newDF2) <- c("newerName", "evenNewerAge") expect_equal(length(names(newDF2)), 2) expect_equal(names(newDF2)[1], "newerName") + + transformedDF <- transform(df, newAge = -df$age, newAge2 = df$age / 2) + expect_equal(length(columns(transformedDF)), 4) + expect_equal(columns(transformedDF)[3], "newAge") + expect_equal(columns(transformedDF)[4], "newAge2") + expect_equal(first(filter(transformedDF, transformedDF$name == "Andy"))$newAge, -30) + + # test if transform on local data frames works + # ensure the proper signature is used - otherwise this will fail to run + attach(airquality) + result <- transform(Ozone, logOzone = log(Ozone)) + expect_equal(nrow(result), 153) + expect_equal(ncol(result), 2) + detach(airquality) }) test_that("write.df() on DataFrame and works with parquetFile", { From e8ea5bafee9ca734edf62021145d0c2d5491cba8 Mon Sep 17 00:00:00 2001 From: martinzapletal Date: Fri, 28 Aug 2015 21:03:48 -0700 Subject: [PATCH 136/802] [SPARK-9910] [ML] User guide for train validation split Author: martinzapletal Closes #8377 from zapletal-martin/SPARK-9910. --- docs/ml-guide.md | 117 ++++++++++++++++++ .../ml/JavaTrainValidationSplitExample.java | 90 ++++++++++++++ .../ml/TrainValidationSplitExample.scala | 80 ++++++++++++ 3 files changed, 287 insertions(+) create mode 100644 examples/src/main/java/org/apache/spark/examples/ml/JavaTrainValidationSplitExample.java create mode 100644 examples/src/main/scala/org/apache/spark/examples/ml/TrainValidationSplitExample.scala diff --git a/docs/ml-guide.md b/docs/ml-guide.md index ce53400b6ee56..a92a285f3af85 100644 --- a/docs/ml-guide.md +++ b/docs/ml-guide.md @@ -872,3 +872,120 @@ jsc.stop(); + +## Example: Model Selection via Train Validation Split +In addition to `CrossValidator` Spark also offers `TrainValidationSplit` for hyper-parameter tuning. +`TrainValidationSplit` only evaluates each combination of parameters once as opposed to k times in + case of `CrossValidator`. It is therefore less expensive, + but will not produce as reliable results when the training dataset is not sufficiently large.. + +`TrainValidationSplit` takes an `Estimator`, a set of `ParamMap`s provided in the `estimatorParamMaps` parameter, +and an `Evaluator`. 
+It begins by splitting the dataset into two parts using the `trainRatio` parameter,
+which are used as separate training and test datasets. For example with `$trainRatio=0.75$` (default),
+`TrainValidationSplit` will generate a training and test dataset pair where 75% of the data is used for training and 25% for validation.
+Similar to `CrossValidator`, `TrainValidationSplit` also iterates through the set of `ParamMap`s.
+For each combination of parameters, it trains the given `Estimator` and evaluates it using the given `Evaluator`.
+The `ParamMap` which produces the best evaluation metric is selected as the best option.
+`TrainValidationSplit` finally fits the `Estimator` using the best `ParamMap` and the entire dataset.
+
+<div class="codetabs">
+
+<div data-lang="scala" markdown="1">
+{% highlight scala %} +import org.apache.spark.ml.evaluation.RegressionEvaluator +import org.apache.spark.ml.regression.LinearRegression +import org.apache.spark.ml.tuning.{ParamGridBuilder, TrainValidationSplit} +import org.apache.spark.mllib.util.MLUtils + +// Prepare training and test data. +val data = MLUtils.loadLibSVMFile(sc, "data/mllib/sample_libsvm_data.txt").toDF() +val Array(training, test) = data.randomSplit(Array(0.9, 0.1), seed = 12345) + +val lr = new LinearRegression() + +// We use a ParamGridBuilder to construct a grid of parameters to search over. +// TrainValidationSplit will try all combinations of values and determine best model using +// the evaluator. +val paramGrid = new ParamGridBuilder() + .addGrid(lr.regParam, Array(0.1, 0.01)) + .addGrid(lr.fitIntercept, Array(true, false)) + .addGrid(lr.elasticNetParam, Array(0.0, 0.5, 1.0)) + .build() + +// In this case the estimator is simply the linear regression. +// A TrainValidationSplit requires an Estimator, a set of Estimator ParamMaps, and an Evaluator. +val trainValidationSplit = new TrainValidationSplit() + .setEstimator(lr) + .setEvaluator(new RegressionEvaluator) + .setEstimatorParamMaps(paramGrid) + +// 80% of the data will be used for training and the remaining 20% for validation. +trainValidationSplit.setTrainRatio(0.8) + +// Run train validation split, and choose the best set of parameters. +val model = trainValidationSplit.fit(training) + +// Make predictions on test data. model is the model with combination of parameters +// that performed best. +model.transform(test) + .select("features", "label", "prediction") + .show() + +{% endhighlight %} +
+
+<div data-lang="java" markdown="1">
+{% highlight java %} +import org.apache.spark.ml.evaluation.RegressionEvaluator; +import org.apache.spark.ml.param.ParamMap; +import org.apache.spark.ml.regression.LinearRegression; +import org.apache.spark.ml.tuning.*; +import org.apache.spark.mllib.regression.LabeledPoint; +import org.apache.spark.mllib.util.MLUtils; +import org.apache.spark.rdd.RDD; +import org.apache.spark.sql.DataFrame; + +DataFrame data = jsql.createDataFrame( + MLUtils.loadLibSVMFile(jsc.sc(), "data/mllib/sample_libsvm_data.txt"), + LabeledPoint.class); + +// Prepare training and test data. +DataFrame[] splits = data.randomSplit(new double [] {0.9, 0.1}, 12345); +DataFrame training = splits[0]; +DataFrame test = splits[1]; + +LinearRegression lr = new LinearRegression(); + +// We use a ParamGridBuilder to construct a grid of parameters to search over. +// TrainValidationSplit will try all combinations of values and determine best model using +// the evaluator. +ParamMap[] paramGrid = new ParamGridBuilder() + .addGrid(lr.regParam(), new double[] {0.1, 0.01}) + .addGrid(lr.fitIntercept()) + .addGrid(lr.elasticNetParam(), new double[] {0.0, 0.5, 1.0}) + .build(); + +// In this case the estimator is simply the linear regression. +// A TrainValidationSplit requires an Estimator, a set of Estimator ParamMaps, and an Evaluator. +TrainValidationSplit trainValidationSplit = new TrainValidationSplit() + .setEstimator(lr) + .setEvaluator(new RegressionEvaluator()) + .setEstimatorParamMaps(paramGrid); + +// 80% of the data will be used for training and the remaining 20% for validation. +trainValidationSplit.setTrainRatio(0.8); + +// Run train validation split, and choose the best set of parameters. +TrainValidationSplitModel model = trainValidationSplit.fit(training); + +// Make predictions on test data. model is the model with combination of parameters +// that performed best. +model.transform(test) + .select("features", "label", "prediction") + .show(); + +{% endhighlight %} +
+
+</div>
diff --git a/examples/src/main/java/org/apache/spark/examples/ml/JavaTrainValidationSplitExample.java b/examples/src/main/java/org/apache/spark/examples/ml/JavaTrainValidationSplitExample.java new file mode 100644 index 0000000000000..23f834ab4332b --- /dev/null +++ b/examples/src/main/java/org/apache/spark/examples/ml/JavaTrainValidationSplitExample.java @@ -0,0 +1,90 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.spark.examples.ml; + +import org.apache.spark.SparkConf; +import org.apache.spark.api.java.JavaSparkContext; +import org.apache.spark.ml.evaluation.RegressionEvaluator; +import org.apache.spark.ml.param.ParamMap; +import org.apache.spark.ml.regression.LinearRegression; +import org.apache.spark.ml.tuning.*; +import org.apache.spark.mllib.regression.LabeledPoint; +import org.apache.spark.mllib.util.MLUtils; +import org.apache.spark.sql.DataFrame; +import org.apache.spark.sql.SQLContext; + +/** + * A simple example demonstrating model selection using TrainValidationSplit. + * + * The example is based on {@link org.apache.spark.examples.ml.JavaSimpleParamsExample} + * using linear regression. + * + * Run with + * {{{ + * bin/run-example ml.JavaTrainValidationSplitExample + * }}} + */ +public class JavaTrainValidationSplitExample { + + public static void main(String[] args) { + SparkConf conf = new SparkConf().setAppName("JavaTrainValidationSplitExample"); + JavaSparkContext jsc = new JavaSparkContext(conf); + SQLContext jsql = new SQLContext(jsc); + + DataFrame data = jsql.createDataFrame( + MLUtils.loadLibSVMFile(jsc.sc(), "data/mllib/sample_libsvm_data.txt"), + LabeledPoint.class); + + // Prepare training and test data. + DataFrame[] splits = data.randomSplit(new double [] {0.9, 0.1}, 12345); + DataFrame training = splits[0]; + DataFrame test = splits[1]; + + LinearRegression lr = new LinearRegression(); + + // We use a ParamGridBuilder to construct a grid of parameters to search over. + // TrainValidationSplit will try all combinations of values and determine best model using + // the evaluator. + ParamMap[] paramGrid = new ParamGridBuilder() + .addGrid(lr.regParam(), new double[] {0.1, 0.01}) + .addGrid(lr.fitIntercept()) + .addGrid(lr.elasticNetParam(), new double[] {0.0, 0.5, 1.0}) + .build(); + + // In this case the estimator is simply the linear regression. + // A TrainValidationSplit requires an Estimator, a set of Estimator ParamMaps, and an Evaluator. + TrainValidationSplit trainValidationSplit = new TrainValidationSplit() + .setEstimator(lr) + .setEvaluator(new RegressionEvaluator()) + .setEstimatorParamMaps(paramGrid); + + // 80% of the data will be used for training and the remaining 20% for validation. 
+ trainValidationSplit.setTrainRatio(0.8); + + // Run train validation split, and choose the best set of parameters. + TrainValidationSplitModel model = trainValidationSplit.fit(training); + + // Make predictions on test data. model is the model with combination of parameters + // that performed best. + model.transform(test) + .select("features", "label", "prediction") + .show(); + + jsc.stop(); + } +} diff --git a/examples/src/main/scala/org/apache/spark/examples/ml/TrainValidationSplitExample.scala b/examples/src/main/scala/org/apache/spark/examples/ml/TrainValidationSplitExample.scala new file mode 100644 index 0000000000000..1abdf219b1c00 --- /dev/null +++ b/examples/src/main/scala/org/apache/spark/examples/ml/TrainValidationSplitExample.scala @@ -0,0 +1,80 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.spark.examples.ml + +import org.apache.spark.ml.evaluation.RegressionEvaluator +import org.apache.spark.ml.regression.LinearRegression +import org.apache.spark.ml.tuning.{ParamGridBuilder, TrainValidationSplit} +import org.apache.spark.mllib.util.MLUtils +import org.apache.spark.sql.SQLContext +import org.apache.spark.{SparkConf, SparkContext} + +/** + * A simple example demonstrating model selection using TrainValidationSplit. + * + * The example is based on [[SimpleParamsExample]] using linear regression. + * Run with + * {{{ + * bin/run-example ml.TrainValidationSplitExample + * }}} + */ +object TrainValidationSplitExample { + + def main(args: Array[String]): Unit = { + val conf = new SparkConf().setAppName("TrainValidationSplitExample") + val sc = new SparkContext(conf) + val sqlContext = new SQLContext(sc) + import sqlContext.implicits._ + + // Prepare training and test data. + val data = MLUtils.loadLibSVMFile(sc, "data/mllib/sample_libsvm_data.txt").toDF() + val Array(training, test) = data.randomSplit(Array(0.9, 0.1), seed = 12345) + + val lr = new LinearRegression() + + // We use a ParamGridBuilder to construct a grid of parameters to search over. + // TrainValidationSplit will try all combinations of values and determine best model using + // the evaluator. + val paramGrid = new ParamGridBuilder() + .addGrid(lr.regParam, Array(0.1, 0.01)) + .addGrid(lr.fitIntercept, Array(true, false)) + .addGrid(lr.elasticNetParam, Array(0.0, 0.5, 1.0)) + .build() + + // In this case the estimator is simply the linear regression. + // A TrainValidationSplit requires an Estimator, a set of Estimator ParamMaps, and an Evaluator. + val trainValidationSplit = new TrainValidationSplit() + .setEstimator(lr) + .setEvaluator(new RegressionEvaluator) + .setEstimatorParamMaps(paramGrid) + + // 80% of the data will be used for training and the remaining 20% for validation. 
+ trainValidationSplit.setTrainRatio(0.8) + + // Run train validation split, and choose the best set of parameters. + val model = trainValidationSplit.fit(training) + + // Make predictions on test data. model is the model with combination of parameters + // that performed best. + model.transform(test) + .select("features", "label", "prediction") + .show() + + sc.stop() + } +} From 5369be806848f43cb87c76504258c4e7de930c90 Mon Sep 17 00:00:00 2001 From: GuoQiang Li Date: Sat, 29 Aug 2015 13:20:22 -0700 Subject: [PATCH 137/802] [SPARK-10350] [DOC] [SQL] Removed duplicated option description from SQL guide Author: GuoQiang Li Closes #8520 from witgo/SPARK-10350. --- docs/sql-programming-guide.md | 10 ---------- 1 file changed, 10 deletions(-) diff --git a/docs/sql-programming-guide.md b/docs/sql-programming-guide.md index e8eb88488ee24..6a1b0fbfa1eb3 100644 --- a/docs/sql-programming-guide.md +++ b/docs/sql-programming-guide.md @@ -1405,16 +1405,6 @@ Configuration of Parquet can be done using the `setConf` method on `SQLContext`

 <tr><th>Property Name</th><th>Default</th><th>Meaning</th></tr>
 <tr>
   <td><code>spark.akka.failure-detector.threshold</code></td>
   <td>300.0</td>
   <td>
     This is set to a larger value to disable failure detector that comes inbuilt akka. It can be
     enabled again, if you plan to use this feature (Not recommended). This maps to akka's
     `akka.remote.transport-failure-detector.threshold`. Tune this in combination of
     `spark.akka.heartbeat.pauses` and `spark.akka.heartbeat.interval` if you need to.
   </td>
 </tr>
 <tr>
   <td><code>spark.akka.frameSize</code></td>
   <td>128</td>
 </tr>
-<tr>
-  <td><code>spark.sql.parquet.mergeSchema</code></td>
-  <td><code>false</code></td>
-  <td>
-    When true, the Parquet data source merges schemas collected from all data files, otherwise the
-    schema is picked from the summary file or a random data file if no summary file is available.
-  </td>
-</tr>
 </table>
## JSON Datasets From 24ffa85c002a095ffb270175ec838995d3ed5469 Mon Sep 17 00:00:00 2001 From: Cheng Lian Date: Sat, 29 Aug 2015 13:24:32 -0700 Subject: [PATCH 138/802] [SPARK-10289] [SQL] A direct write API for testing Parquet This PR introduces a direct write API for testing Parquet. It's a DSL flavored version of the [`writeDirect` method] [1] comes with parquet-avro testing code. With this API, it's much easier to construct arbitrary Parquet structures. It's especially useful when adding regression tests for various compatibility corner cases. Sample usage of this API can be found in the new test case added in `ParquetThriftCompatibilitySuite`. [1]: https://github.com/apache/parquet-mr/blob/apache-parquet-1.8.1/parquet-avro/src/test/java/org/apache/parquet/avro/TestArrayCompatibility.java#L945-L972 Author: Cheng Lian Closes #8454 from liancheng/spark-10289/parquet-testing-direct-write-api. --- .../parquet/ParquetCompatibilityTest.scala | 84 +++++++++++++-- .../ParquetThriftCompatibilitySuite.scala | 100 +++++++++++++++--- 2 files changed, 160 insertions(+), 24 deletions(-) diff --git a/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/parquet/ParquetCompatibilityTest.scala b/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/parquet/ParquetCompatibilityTest.scala index df68432faeeb3..91f3ce4d34c8b 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/parquet/ParquetCompatibilityTest.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/parquet/ParquetCompatibilityTest.scala @@ -17,11 +17,15 @@ package org.apache.spark.sql.execution.datasources.parquet -import scala.collection.JavaConverters._ +import scala.collection.JavaConverters.{collectionAsScalaIterableConverter, mapAsJavaMapConverter, seqAsJavaListConverter} +import org.apache.hadoop.conf.Configuration import org.apache.hadoop.fs.{Path, PathFilter} -import org.apache.parquet.hadoop.ParquetFileReader -import org.apache.parquet.schema.MessageType +import org.apache.parquet.hadoop.api.WriteSupport +import org.apache.parquet.hadoop.api.WriteSupport.WriteContext +import org.apache.parquet.hadoop.{ParquetFileReader, ParquetWriter} +import org.apache.parquet.io.api.RecordConsumer +import org.apache.parquet.schema.{MessageType, MessageTypeParser} import org.apache.spark.sql.QueryTest @@ -38,11 +42,10 @@ private[sql] abstract class ParquetCompatibilityTest extends QueryTest with Parq val fs = fsPath.getFileSystem(configuration) val parquetFiles = fs.listStatus(fsPath, new PathFilter { override def accept(path: Path): Boolean = pathFilter(path) - }).toSeq + }).toSeq.asJava - val footers = - ParquetFileReader.readAllFootersInParallel(configuration, parquetFiles.asJava, true) - footers.iterator().next().getParquetMetadata.getFileMetaData.getSchema + val footers = ParquetFileReader.readAllFootersInParallel(configuration, parquetFiles, true) + footers.asScala.head.getParquetMetadata.getFileMetaData.getSchema } protected def logParquetSchema(path: String): Unit = { @@ -53,8 +56,69 @@ private[sql] abstract class ParquetCompatibilityTest extends QueryTest with Parq } } -object ParquetCompatibilityTest { - def makeNullable[T <: AnyRef](i: Int)(f: => T): T = { - if (i % 3 == 0) null.asInstanceOf[T] else f +private[sql] object ParquetCompatibilityTest { + implicit class RecordConsumerDSL(consumer: RecordConsumer) { + def message(f: => Unit): Unit = { + consumer.startMessage() + f + consumer.endMessage() + } + + def group(f: => Unit): Unit = { + consumer.startGroup() + 
f + consumer.endGroup() + } + + def field(name: String, index: Int)(f: => Unit): Unit = { + consumer.startField(name, index) + f + consumer.endField(name, index) + } + } + + /** + * A testing Parquet [[WriteSupport]] implementation used to write manually constructed Parquet + * records with arbitrary structures. + */ + private class DirectWriteSupport(schema: MessageType, metadata: Map[String, String]) + extends WriteSupport[RecordConsumer => Unit] { + + private var recordConsumer: RecordConsumer = _ + + override def init(configuration: Configuration): WriteContext = { + new WriteContext(schema, metadata.asJava) + } + + override def write(recordWriter: RecordConsumer => Unit): Unit = { + recordWriter.apply(recordConsumer) + } + + override def prepareForWrite(recordConsumer: RecordConsumer): Unit = { + this.recordConsumer = recordConsumer + } + } + + /** + * Writes arbitrary messages conforming to a given `schema` to a Parquet file located by `path`. + * Records are produced by `recordWriters`. + */ + def writeDirect(path: String, schema: String, recordWriters: (RecordConsumer => Unit)*): Unit = { + writeDirect(path, schema, Map.empty[String, String], recordWriters: _*) + } + + /** + * Writes arbitrary messages conforming to a given `schema` to a Parquet file located by `path` + * with given user-defined key-value `metadata`. Records are produced by `recordWriters`. + */ + def writeDirect( + path: String, + schema: String, + metadata: Map[String, String], + recordWriters: (RecordConsumer => Unit)*): Unit = { + val messageType = MessageTypeParser.parseMessageType(schema) + val writeSupport = new DirectWriteSupport(messageType, metadata) + val parquetWriter = new ParquetWriter[RecordConsumer => Unit](new Path(path), writeSupport) + try recordWriters.foreach(parquetWriter.write) finally parquetWriter.close() } } diff --git a/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/parquet/ParquetThriftCompatibilitySuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/parquet/ParquetThriftCompatibilitySuite.scala index b789c5a106e56..88a3d878f97fe 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/parquet/ParquetThriftCompatibilitySuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/parquet/ParquetThriftCompatibilitySuite.scala @@ -33,11 +33,9 @@ class ParquetThriftCompatibilitySuite extends ParquetCompatibilityTest with Shar """.stripMargin) checkAnswer(sqlContext.read.parquet(parquetFilePath.toString), (0 until 10).map { i => - def nullable[T <: AnyRef]: ( => T) => T = makeNullable[T](i) - val suits = Array("SPADES", "HEARTS", "DIAMONDS", "CLUBS") - Row( + val nonNullablePrimitiveValues = Seq( i % 2 == 0, i.toByte, (i + 1).toShort, @@ -50,18 +48,15 @@ class ParquetThriftCompatibilitySuite extends ParquetCompatibilityTest with Shar s"val_$i", s"val_$i", // Thrift ENUM values are converted to Parquet binaries containing UTF-8 strings - suits(i % 4), - - nullable(i % 2 == 0: java.lang.Boolean), - nullable(i.toByte: java.lang.Byte), - nullable((i + 1).toShort: java.lang.Short), - nullable(i + 2: Integer), - nullable((i * 10).toLong: java.lang.Long), - nullable(i.toDouble + 0.2d: java.lang.Double), - nullable(s"val_$i"), - nullable(s"val_$i"), - nullable(suits(i % 4)), + suits(i % 4)) + + val nullablePrimitiveValues = if (i % 3 == 0) { + Seq.fill(nonNullablePrimitiveValues.length)(null) + } else { + nonNullablePrimitiveValues + } + val complexValues = Seq( Seq.tabulate(3)(n => s"arr_${i + n}"), // 
Thrift `SET`s are converted to Parquet `LIST`s Seq(i), @@ -71,6 +66,83 @@ class ParquetThriftCompatibilitySuite extends ParquetCompatibilityTest with Shar Row(Seq.tabulate(3)(j => i + j + m), s"val_${i + m}") } }.toMap) + + Row(nonNullablePrimitiveValues ++ nullablePrimitiveValues ++ complexValues: _*) }) } + + test("SPARK-10136 list of primitive list") { + withTempPath { dir => + val path = dir.getCanonicalPath + + // This Parquet schema is translated from the following Thrift schema: + // + // struct ListOfPrimitiveList { + // 1: list> f; + // } + val schema = + s"""message ListOfPrimitiveList { + | required group f (LIST) { + | repeated group f_tuple (LIST) { + | repeated int32 f_tuple_tuple; + | } + | } + |} + """.stripMargin + + writeDirect(path, schema, { rc => + rc.message { + rc.field("f", 0) { + rc.group { + rc.field("f_tuple", 0) { + rc.group { + rc.field("f_tuple_tuple", 0) { + rc.addInteger(0) + rc.addInteger(1) + } + } + + rc.group { + rc.field("f_tuple_tuple", 0) { + rc.addInteger(2) + rc.addInteger(3) + } + } + } + } + } + } + }, { rc => + rc.message { + rc.field("f", 0) { + rc.group { + rc.field("f_tuple", 0) { + rc.group { + rc.field("f_tuple_tuple", 0) { + rc.addInteger(4) + rc.addInteger(5) + } + } + + rc.group { + rc.field("f_tuple_tuple", 0) { + rc.addInteger(6) + rc.addInteger(7) + } + } + } + } + } + } + }) + + logParquetSchema(path) + + checkAnswer( + sqlContext.read.parquet(path), + Seq( + Row(Seq(Seq(0, 1), Seq(2, 3))), + Row(Seq(Seq(4, 5), Seq(6, 7))))) + } + } } From 5c3d16a9b91bb9a458d3ba141f7bef525cf3d285 Mon Sep 17 00:00:00 2001 From: Michael Armbrust Date: Sat, 29 Aug 2015 13:26:01 -0700 Subject: [PATCH 139/802] [SPARK-10344] [SQL] Add tests for extraStrategies Actually using this API requires access to a lot of classes that we might make private by accident. I've added some tests to prevent this. Author: Michael Armbrust Closes #8516 from marmbrus/extraStrategiesTests. --- .../spark/sql/ExtraStrategiesSuite.scala | 67 +++++++++++++++++++ .../spark/sql/test/SharedSQLContext.scala | 2 +- 2 files changed, 68 insertions(+), 1 deletion(-) create mode 100644 sql/core/src/test/scala/org/apache/spark/sql/ExtraStrategiesSuite.scala diff --git a/sql/core/src/test/scala/org/apache/spark/sql/ExtraStrategiesSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/ExtraStrategiesSuite.scala new file mode 100644 index 0000000000000..8d2f45d70308b --- /dev/null +++ b/sql/core/src/test/scala/org/apache/spark/sql/ExtraStrategiesSuite.scala @@ -0,0 +1,67 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package test.org.apache.spark.sql + +import org.apache.spark.rdd.RDD +import org.apache.spark.sql.catalyst.InternalRow +import org.apache.spark.sql.catalyst.expressions.{Literal, GenericInternalRow, Attribute} +import org.apache.spark.sql.catalyst.plans.logical.{Project, LogicalPlan} +import org.apache.spark.sql.execution.SparkPlan +import org.apache.spark.sql.{Row, Strategy, QueryTest} +import org.apache.spark.sql.test.SharedSQLContext +import org.apache.spark.unsafe.types.UTF8String + +case class FastOperator(output: Seq[Attribute]) extends SparkPlan { + + override protected def doExecute(): RDD[InternalRow] = { + val str = Literal("so fast").value + val row = new GenericInternalRow(Array[Any](str)) + sparkContext.parallelize(Seq(row)) + } + + override def children: Seq[SparkPlan] = Nil +} + +object TestStrategy extends Strategy { + def apply(plan: LogicalPlan): Seq[SparkPlan] = plan match { + case Project(Seq(attr), _) if attr.name == "a" => + FastOperator(attr.toAttribute :: Nil) :: Nil + case _ => Nil + } +} + +class ExtraStrategiesSuite extends QueryTest with SharedSQLContext { + import testImplicits._ + + test("insert an extraStrategy") { + try { + sqlContext.experimental.extraStrategies = TestStrategy :: Nil + + val df = sqlContext.sparkContext.parallelize(Seq(("so slow", 1))).toDF("a", "b") + checkAnswer( + df.select("a"), + Row("so fast")) + + checkAnswer( + df.select("a", "b"), + Row("so slow", 1)) + } finally { + sqlContext.experimental.extraStrategies = Nil + } + } +} diff --git a/sql/core/src/test/scala/org/apache/spark/sql/test/SharedSQLContext.scala b/sql/core/src/test/scala/org/apache/spark/sql/test/SharedSQLContext.scala index 8a061b6bc690d..d23c6a0732669 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/test/SharedSQLContext.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/test/SharedSQLContext.scala @@ -23,7 +23,7 @@ import org.apache.spark.sql.{ColumnName, SQLContext} /** * Helper trait for SQL test suites where all tests share a single [[TestSQLContext]]. */ -private[sql] trait SharedSQLContext extends SQLTestUtils { +trait SharedSQLContext extends SQLTestUtils { /** * The [[TestSQLContext]] to use for all tests in this suite. From 277148b285748e863f2b9fdf6cf12963977f91ca Mon Sep 17 00:00:00 2001 From: wangwei Date: Sat, 29 Aug 2015 13:29:50 -0700 Subject: [PATCH 140/802] [SPARK-10226] [SQL] Fix exclamation mark issue in SparkSQL When I tested the latest version of spark with exclamation mark, I got some errors. Then I reseted the spark version and found that commit id "a2409d1c8e8ddec04b529ac6f6a12b5993f0eeda" brought the bug. With jline version changing from 0.9.94 to 2.12 after this commit, exclamation mark would be treated as a special character in ConsoleReader. Author: wangwei Closes #8420 from small-wang/jline-SPARK-10226. 
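As a side note, the behavior being fixed can be reproduced with a minimal standalone sketch. This assumes jline 2.12 on the classpath; the object name, prompt, and echo loop below are illustrative only, and the actual change to SparkSQLCLIDriver is the one-line diff that follows.

```scala
import jline.console.ConsoleReader

// Toy console loop: with expandEvents left at its default (true), jline 2.x treats
// '!' as a history-expansion character, so input such as
//   SELECT * FROM src WHERE key != 1
// is rewritten or rejected before it ever reaches the SQL parser.
// Calling setExpandEvents(false) passes the line through verbatim.
object ExpandEventsSketch {
  def main(args: Array[String]): Unit = {
    val reader = new ConsoleReader()
    reader.setBellEnabled(false)
    reader.setExpandEvents(false) // the same call the fix below adds to SparkSQLCLIDriver
    var line = reader.readLine("sketch> ")
    while (line != null && line.trim() != "quit") {
      println(s"would execute: $line")
      line = reader.readLine("sketch> ")
    }
  }
}
```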
--- .../apache/spark/sql/hive/thriftserver/SparkSQLCLIDriver.scala | 1 + 1 file changed, 1 insertion(+) diff --git a/sql/hive-thriftserver/src/main/scala/org/apache/spark/sql/hive/thriftserver/SparkSQLCLIDriver.scala b/sql/hive-thriftserver/src/main/scala/org/apache/spark/sql/hive/thriftserver/SparkSQLCLIDriver.scala index a29df567983b1..b5073961a1c84 100644 --- a/sql/hive-thriftserver/src/main/scala/org/apache/spark/sql/hive/thriftserver/SparkSQLCLIDriver.scala +++ b/sql/hive-thriftserver/src/main/scala/org/apache/spark/sql/hive/thriftserver/SparkSQLCLIDriver.scala @@ -171,6 +171,7 @@ private[hive] object SparkSQLCLIDriver extends Logging { val reader = new ConsoleReader() reader.setBellEnabled(false) + reader.setExpandEvents(false) // reader.setDebug(new PrintWriter(new FileWriter("writer.debug", true))) CliDriver.getCommandCompleter.foreach((e) => reader.addCompleter(e)) From 6a6f3c91ee1f63dd464eb03d156d02c1a5887d88 Mon Sep 17 00:00:00 2001 From: Josh Rosen Date: Sat, 29 Aug 2015 13:36:25 -0700 Subject: [PATCH 141/802] [SPARK-10330] Use SparkHadoopUtil TaskAttemptContext reflection methods in more places SparkHadoopUtil contains methods that use reflection to work around TaskAttemptContext binary incompatibilities between Hadoop 1.x and 2.x. We should use these methods in more places. Author: Josh Rosen Closes #8499 from JoshRosen/use-hadoop-reflection-in-more-places. --- .../sql/execution/datasources/WriterContainer.scala | 10 +++++++--- .../sql/execution/datasources/json/JSONRelation.scala | 7 +++++-- .../datasources/parquet/ParquetRelation.scala | 7 +++++-- .../org/apache/spark/sql/hive/orc/OrcRelation.scala | 9 ++++++--- .../apache/spark/sql/sources/SimpleTextRelation.scala | 7 +++++-- 5 files changed, 28 insertions(+), 12 deletions(-) diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/WriterContainer.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/WriterContainer.scala index 78f48a5cd72c7..879fd69863211 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/WriterContainer.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/WriterContainer.scala @@ -25,6 +25,7 @@ import org.apache.hadoop.fs.Path import org.apache.hadoop.mapreduce._ import org.apache.hadoop.mapreduce.lib.output.{FileOutputCommitter => MapReduceFileOutputCommitter} import org.apache.spark._ +import org.apache.spark.deploy.SparkHadoopUtil import org.apache.spark.mapred.SparkHadoopMapRedUtil import org.apache.spark.mapreduce.SparkHadoopMapReduceUtil import org.apache.spark.sql._ @@ -145,7 +146,8 @@ private[sql] abstract class BaseWriterContainer( "because spark.speculation is configured to be true.") defaultOutputCommitter } else { - val committerClass = context.getConfiguration.getClass( + val configuration = SparkHadoopUtil.get.getConfigurationFromJobContext(context) + val committerClass = configuration.getClass( SQLConf.OUTPUT_COMMITTER_CLASS.key, null, classOf[OutputCommitter]) Option(committerClass).map { clazz => @@ -227,7 +229,8 @@ private[sql] class DefaultWriterContainer( def writeRows(taskContext: TaskContext, iterator: Iterator[InternalRow]): Unit = { executorSideSetup(taskContext) - taskAttemptContext.getConfiguration.set("spark.sql.sources.output.path", outputPath) + val configuration = SparkHadoopUtil.get.getConfigurationFromJobContext(taskAttemptContext) + configuration.set("spark.sql.sources.output.path", outputPath) val writer = outputWriterFactory.newInstance(getWorkPath, dataSchema, 
taskAttemptContext) writer.initConverter(dataSchema) @@ -395,7 +398,8 @@ private[sql] class DynamicPartitionWriterContainer( def newOutputWriter(key: InternalRow): OutputWriter = { val partitionPath = getPartitionString(key).getString(0) val path = new Path(getWorkPath, partitionPath) - taskAttemptContext.getConfiguration.set( + val configuration = SparkHadoopUtil.get.getConfigurationFromJobContext(taskAttemptContext) + configuration.set( "spark.sql.sources.output.path", new Path(outputPath, partitionPath).toString) val newWriter = outputWriterFactory.newInstance(path.toString, dataSchema, taskAttemptContext) newWriter.initConverter(dataSchema) diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/json/JSONRelation.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/json/JSONRelation.scala index ab8ca5f748f24..7a49157d9e72c 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/json/JSONRelation.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/json/JSONRelation.scala @@ -30,6 +30,7 @@ import org.apache.hadoop.mapreduce.{Job, RecordWriter, TaskAttemptContext} import org.apache.spark.Logging import org.apache.spark.broadcast.Broadcast +import org.apache.spark.deploy.SparkHadoopUtil import org.apache.spark.mapred.SparkHadoopMapRedUtil import org.apache.spark.rdd.RDD import org.apache.spark.sql.catalyst.InternalRow @@ -169,8 +170,10 @@ private[json] class JsonOutputWriter( private val recordWriter: RecordWriter[NullWritable, Text] = { new TextOutputFormat[NullWritable, Text]() { override def getDefaultWorkFile(context: TaskAttemptContext, extension: String): Path = { - val uniqueWriteJobId = context.getConfiguration.get("spark.sql.sources.writeJobUUID") - val split = context.getTaskAttemptID.getTaskID.getId + val configuration = SparkHadoopUtil.get.getConfigurationFromJobContext(context) + val uniqueWriteJobId = configuration.get("spark.sql.sources.writeJobUUID") + val taskAttemptId = SparkHadoopUtil.get.getTaskAttemptIDFromTaskAttemptContext(context) + val split = taskAttemptId.getTaskID.getId new Path(path, f"part-r-$split%05d-$uniqueWriteJobId$extension") } }.getRecordWriter(context) diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/parquet/ParquetRelation.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/parquet/ParquetRelation.scala index 64982f37cf872..c6bbc392cad4c 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/parquet/ParquetRelation.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/parquet/ParquetRelation.scala @@ -40,6 +40,7 @@ import org.apache.parquet.{Log => ApacheParquetLog} import org.slf4j.bridge.SLF4JBridgeHandler import org.apache.spark.broadcast.Broadcast +import org.apache.spark.deploy.SparkHadoopUtil import org.apache.spark.rdd.{RDD, SqlNewHadoopPartition, SqlNewHadoopRDD} import org.apache.spark.sql._ import org.apache.spark.sql.catalyst.InternalRow @@ -81,8 +82,10 @@ private[sql] class ParquetOutputWriter(path: String, context: TaskAttemptContext // `FileOutputCommitter.getWorkPath()`, which points to the base directory of all // partitions in the case of dynamic partitioning. 
override def getDefaultWorkFile(context: TaskAttemptContext, extension: String): Path = { - val uniqueWriteJobId = context.getConfiguration.get("spark.sql.sources.writeJobUUID") - val split = context.getTaskAttemptID.getTaskID.getId + val configuration = SparkHadoopUtil.get.getConfigurationFromJobContext(context) + val uniqueWriteJobId = configuration.get("spark.sql.sources.writeJobUUID") + val taskAttemptId = SparkHadoopUtil.get.getTaskAttemptIDFromTaskAttemptContext(context) + val split = taskAttemptId.getTaskID.getId new Path(path, f"part-r-$split%05d-$uniqueWriteJobId$extension") } } diff --git a/sql/hive/src/main/scala/org/apache/spark/sql/hive/orc/OrcRelation.scala b/sql/hive/src/main/scala/org/apache/spark/sql/hive/orc/OrcRelation.scala index 1cff5cf9c3543..4eeca9aec12bd 100644 --- a/sql/hive/src/main/scala/org/apache/spark/sql/hive/orc/OrcRelation.scala +++ b/sql/hive/src/main/scala/org/apache/spark/sql/hive/orc/OrcRelation.scala @@ -34,6 +34,7 @@ import org.apache.hadoop.mapreduce.lib.input.FileInputFormat import org.apache.hadoop.mapreduce.{Job, TaskAttemptContext} import org.apache.spark.Logging +import org.apache.spark.deploy.SparkHadoopUtil import org.apache.spark.mapred.SparkHadoopMapRedUtil import org.apache.spark.rdd.{HadoopRDD, RDD} import org.apache.spark.sql.catalyst.InternalRow @@ -77,7 +78,8 @@ private[orc] class OrcOutputWriter( }.mkString(":")) val serde = new OrcSerde - serde.initialize(context.getConfiguration, table) + val configuration = SparkHadoopUtil.get.getConfigurationFromJobContext(context) + serde.initialize(configuration, table) serde } @@ -109,9 +111,10 @@ private[orc] class OrcOutputWriter( private lazy val recordWriter: RecordWriter[NullWritable, Writable] = { recordWriterInstantiated = true - val conf = context.getConfiguration + val conf = SparkHadoopUtil.get.getConfigurationFromJobContext(context) val uniqueWriteJobId = conf.get("spark.sql.sources.writeJobUUID") - val partition = context.getTaskAttemptID.getTaskID.getId + val taskAttemptId = SparkHadoopUtil.get.getTaskAttemptIDFromTaskAttemptContext(context) + val partition = taskAttemptId.getTaskID.getId val filename = f"part-r-$partition%05d-$uniqueWriteJobId.orc" new OrcOutputFormat().getRecordWriter( diff --git a/sql/hive/src/test/scala/org/apache/spark/sql/sources/SimpleTextRelation.scala b/sql/hive/src/test/scala/org/apache/spark/sql/sources/SimpleTextRelation.scala index e8141923a9b5c..527ca7a81cad8 100644 --- a/sql/hive/src/test/scala/org/apache/spark/sql/sources/SimpleTextRelation.scala +++ b/sql/hive/src/test/scala/org/apache/spark/sql/sources/SimpleTextRelation.scala @@ -27,6 +27,7 @@ import org.apache.hadoop.mapreduce.lib.output.{FileOutputFormat, TextOutputForma import org.apache.hadoop.mapreduce.{Job, RecordWriter, TaskAttemptContext} import org.apache.spark.rdd.RDD +import org.apache.spark.deploy.SparkHadoopUtil import org.apache.spark.sql.catalyst.CatalystTypeConverters import org.apache.spark.sql.catalyst.expressions.{Cast, Literal} import org.apache.spark.sql.types.{DataType, StructType} @@ -53,8 +54,10 @@ class AppendingTextOutputFormat(outputFile: Path) extends TextOutputFormat[NullW numberFormat.setGroupingUsed(false) override def getDefaultWorkFile(context: TaskAttemptContext, extension: String): Path = { - val uniqueWriteJobId = context.getConfiguration.get("spark.sql.sources.writeJobUUID") - val split = context.getTaskAttemptID.getTaskID.getId + val configuration = SparkHadoopUtil.get.getConfigurationFromJobContext(context) + val uniqueWriteJobId = 
configuration.get("spark.sql.sources.writeJobUUID") + val taskAttemptId = SparkHadoopUtil.get.getTaskAttemptIDFromTaskAttemptContext(context) + val split = taskAttemptId.getTaskID.getId val name = FileOutputFormat.getOutputName(context) new Path(outputFile, s"$name-${numberFormat.format(split)}-$uniqueWriteJobId") } From 097a7e36e0bf7290b1879331375bacc905583bd3 Mon Sep 17 00:00:00 2001 From: Yin Huai Date: Sat, 29 Aug 2015 16:39:40 -0700 Subject: [PATCH 142/802] [SPARK-10339] [SPARK-10334] [SPARK-10301] [SQL] Partitioned table scan can OOM driver and throw a better error message when users need to enable parquet schema merging This fixes the problem that scanning partitioned table causes driver have a high memory pressure and takes down the cluster. Also, with this fix, we will be able to correctly show the query plan of a query consuming partitioned tables. https://issues.apache.org/jira/browse/SPARK-10339 https://issues.apache.org/jira/browse/SPARK-10334 Finally, this PR squeeze in a "quick fix" for SPARK-10301. It is not a real fix, but it just throw a better error message to let user know what to do. Author: Yin Huai Closes #8515 from yhuai/partitionedTableScan. --- .../datasources/DataSourceStrategy.scala | 85 ++++++++++--------- .../parquet/CatalystRowConverter.scala | 7 ++ .../ParquetHadoopFsRelationSuite.scala | 15 +++- 3 files changed, 65 insertions(+), 42 deletions(-) diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/DataSourceStrategy.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/DataSourceStrategy.scala index 6c1ef6a6df887..c58213155daa8 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/DataSourceStrategy.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/DataSourceStrategy.scala @@ -26,6 +26,7 @@ import org.apache.spark.sql.catalyst.expressions._ import org.apache.spark.sql.catalyst.planning.PhysicalOperation import org.apache.spark.sql.catalyst.plans.logical import org.apache.spark.sql.catalyst.plans.logical.LogicalPlan +import org.apache.spark.sql.execution.SparkPlan import org.apache.spark.sql.sources._ import org.apache.spark.sql.types.{StringType, StructType} import org.apache.spark.sql.{SaveMode, Strategy, execution, sources, _} @@ -121,7 +122,7 @@ private[sql] object DataSourceStrategy extends Strategy with Logging { projections: Seq[NamedExpression], filters: Seq[Expression], partitionColumns: StructType, - partitions: Array[Partition]) = { + partitions: Array[Partition]): SparkPlan = { val relation = logicalRelation.relation.asInstanceOf[HadoopFsRelation] // Because we are creating one RDD per partition, we need to have a shared HadoopConf. @@ -130,49 +131,51 @@ private[sql] object DataSourceStrategy extends Strategy with Logging { val confBroadcast = relation.sqlContext.sparkContext.broadcast(new SerializableConfiguration(sharedHadoopConf)) - // Builds RDD[Row]s for each selected partition. - val perPartitionRows = partitions.map { case Partition(partitionValues, dir) => - // The table scan operator (PhysicalRDD) which retrieves required columns from data files. - // Notice that the schema of data files, represented by `relation.dataSchema`, may contain - // some partition column(s). - val scan = - pruneFilterProject( - logicalRelation, - projections, - filters, - (columns: Seq[Attribute], filters) => { - val partitionColNames = partitionColumns.fieldNames - - // Don't scan any partition columns to save I/O. 
Here we are being optimistic and - // assuming partition columns data stored in data files are always consistent with those - // partition values encoded in partition directory paths. - val needed = columns.filterNot(a => partitionColNames.contains(a.name)) - val dataRows = - relation.buildScan(needed.map(_.name).toArray, filters, Array(dir), confBroadcast) - - // Merges data values with partition values. - mergeWithPartitionValues( - relation.schema, - columns.map(_.name).toArray, - partitionColNames, - partitionValues, - toCatalystRDD(logicalRelation, needed, dataRows)) - }) - - scan.execute() - } + // Now, we create a scan builder, which will be used by pruneFilterProject. This scan builder + // will union all partitions and attach partition values if needed. + val scanBuilder = { + (columns: Seq[Attribute], filters: Array[Filter]) => { + // Builds RDD[Row]s for each selected partition. + val perPartitionRows = partitions.map { case Partition(partitionValues, dir) => + val partitionColNames = partitionColumns.fieldNames + + // Don't scan any partition columns to save I/O. Here we are being optimistic and + // assuming partition columns data stored in data files are always consistent with those + // partition values encoded in partition directory paths. + val needed = columns.filterNot(a => partitionColNames.contains(a.name)) + val dataRows = + relation.buildScan(needed.map(_.name).toArray, filters, Array(dir), confBroadcast) + + // Merges data values with partition values. + mergeWithPartitionValues( + relation.schema, + columns.map(_.name).toArray, + partitionColNames, + partitionValues, + toCatalystRDD(logicalRelation, needed, dataRows)) + } + + val unionedRows = + if (perPartitionRows.length == 0) { + relation.sqlContext.emptyResult + } else { + new UnionRDD(relation.sqlContext.sparkContext, perPartitionRows) + } - val unionedRows = - if (perPartitionRows.length == 0) { - relation.sqlContext.emptyResult - } else { - new UnionRDD(relation.sqlContext.sparkContext, perPartitionRows) + unionedRows } + } + + // Create the scan operator. If needed, add Filter and/or Project on top of the scan. + // The added Filter/Project is on top of the unioned RDD. We do not want to create + // one Filter/Project for every partition. + val sparkPlan = pruneFilterProject( + logicalRelation, + projections, + filters, + scanBuilder) - execution.PhysicalRDD.createFromDataSource( - projections.map(_.toAttribute), - unionedRows, - logicalRelation.relation) + sparkPlan } // TODO: refactor this thing. It is very complicated because it does projection internally. diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/parquet/CatalystRowConverter.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/parquet/CatalystRowConverter.scala index f682ca0d8ff4f..fe13dfbbed385 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/parquet/CatalystRowConverter.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/parquet/CatalystRowConverter.scala @@ -196,6 +196,13 @@ private[parquet] class CatalystRowConverter( } } + if (paddedParquetFields.length != catalystType.length) { + throw new UnsupportedOperationException( + "A Parquet file's schema has different number of fields with the table schema. 
" + + "Please enable schema merging by setting \"mergeSchema\" to true when load " + + "a Parquet dataset or set spark.sql.parquet.mergeSchema to true in SQLConf.") + } + paddedParquetFields.zip(catalystType).zipWithIndex.map { case ((parquetFieldType, catalystField), ordinal) => // Converted field value should be set to the `ordinal`-th cell of `currentRow` diff --git a/sql/hive/src/test/scala/org/apache/spark/sql/sources/ParquetHadoopFsRelationSuite.scala b/sql/hive/src/test/scala/org/apache/spark/sql/sources/ParquetHadoopFsRelationSuite.scala index cb4cedddbfddd..06dadbb5feab0 100644 --- a/sql/hive/src/test/scala/org/apache/spark/sql/sources/ParquetHadoopFsRelationSuite.scala +++ b/sql/hive/src/test/scala/org/apache/spark/sql/sources/ParquetHadoopFsRelationSuite.scala @@ -23,7 +23,7 @@ import com.google.common.io.Files import org.apache.hadoop.fs.Path import org.apache.spark.deploy.SparkHadoopUtil -import org.apache.spark.sql.{AnalysisException, SaveMode} +import org.apache.spark.sql.{execution, AnalysisException, SaveMode} import org.apache.spark.sql.types.{IntegerType, StructField, StructType} @@ -136,4 +136,17 @@ class ParquetHadoopFsRelationSuite extends HadoopFsRelationTest { assert(fs.exists(commonSummaryPath)) } } + + test("SPARK-10334 Projections and filters should be kept in physical plan") { + withTempPath { dir => + val path = dir.getCanonicalPath + + sqlContext.range(2).select('id as 'a, 'id as 'b).write.partitionBy("b").parquet(path) + val df = sqlContext.read.parquet(path).filter('a === 0).select('b) + val physicalPlan = df.queryExecution.executedPlan + + assert(physicalPlan.collect { case p: execution.Project => p }.length === 1) + assert(physicalPlan.collect { case p: execution.Filter => p }.length === 1) + } + } } From 13f5f8ec97c6886346641b73bd99004e0d70836c Mon Sep 17 00:00:00 2001 From: zsxwing Date: Sat, 29 Aug 2015 18:10:44 -0700 Subject: [PATCH 143/802] [SPARK-9986] [SPARK-9991] [SPARK-9993] [SQL] Create a simple test framework for local operators This PR includes the following changes: - Add `LocalNodeTest` for local operator tests and add unit tests for FilterNode and ProjectNode. - Add `LimitNode` and `UnionNode` and their unit tests to show how to use `LocalNodeTest`. (SPARK-9991, SPARK-9993) Author: zsxwing Closes #8464 from zsxwing/local-execution. 
--- .../sql/execution/local/FilterNode.scala | 6 +- .../spark/sql/execution/local/LimitNode.scala | 45 ++++++ .../spark/sql/execution/local/LocalNode.scala | 13 +- .../sql/execution/local/ProjectNode.scala | 4 +- .../sql/execution/local/SeqScanNode.scala | 2 +- .../spark/sql/execution/local/UnionNode.scala | 72 +++++++++ .../spark/sql/execution/SparkPlanTest.scala | 46 +----- .../sql/execution/local/FilterNodeSuite.scala | 41 +++++ .../sql/execution/local/LimitNodeSuite.scala | 39 +++++ .../sql/execution/local/LocalNodeTest.scala | 146 ++++++++++++++++++ .../execution/local/ProjectNodeSuite.scala | 44 ++++++ .../sql/execution/local/UnionNodeSuite.scala | 52 +++++++ .../apache/spark/sql/test/SQLTestData.scala | 8 + .../apache/spark/sql/test/SQLTestUtils.scala | 46 +++++- 14 files changed, 509 insertions(+), 55 deletions(-) create mode 100644 sql/core/src/main/scala/org/apache/spark/sql/execution/local/LimitNode.scala create mode 100644 sql/core/src/main/scala/org/apache/spark/sql/execution/local/UnionNode.scala create mode 100644 sql/core/src/test/scala/org/apache/spark/sql/execution/local/FilterNodeSuite.scala create mode 100644 sql/core/src/test/scala/org/apache/spark/sql/execution/local/LimitNodeSuite.scala create mode 100644 sql/core/src/test/scala/org/apache/spark/sql/execution/local/LocalNodeTest.scala create mode 100644 sql/core/src/test/scala/org/apache/spark/sql/execution/local/ProjectNodeSuite.scala create mode 100644 sql/core/src/test/scala/org/apache/spark/sql/execution/local/UnionNodeSuite.scala diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/local/FilterNode.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/local/FilterNode.scala index a485a1a1d7ae4..81dd37c7da733 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/local/FilterNode.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/local/FilterNode.scala @@ -35,13 +35,13 @@ case class FilterNode(condition: Expression, child: LocalNode) extends UnaryLoca override def next(): Boolean = { var found = false - while (child.next() && !found) { - found = predicate.apply(child.get()) + while (!found && child.next()) { + found = predicate.apply(child.fetch()) } found } - override def get(): InternalRow = child.get() + override def fetch(): InternalRow = child.fetch() override def close(): Unit = child.close() } diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/local/LimitNode.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/local/LimitNode.scala new file mode 100644 index 0000000000000..fffc52abf6dd5 --- /dev/null +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/local/LimitNode.scala @@ -0,0 +1,45 @@ +/* +* Licensed to the Apache Software Foundation (ASF) under one or more +* contributor license agreements. See the NOTICE file distributed with +* this work for additional information regarding copyright ownership. +* The ASF licenses this file to You under the Apache License, Version 2.0 +* (the "License"); you may not use this file except in compliance with +* the License. You may obtain a copy of the License at +* +* http://www.apache.org/licenses/LICENSE-2.0 +* +* Unless required by applicable law or agreed to in writing, software +* distributed under the License is distributed on an "AS IS" BASIS, +* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +* See the License for the specific language governing permissions and +* limitations under the License. 
+*/ + +package org.apache.spark.sql.execution.local + +import org.apache.spark.sql.catalyst.InternalRow +import org.apache.spark.sql.catalyst.expressions.Attribute + + +case class LimitNode(limit: Int, child: LocalNode) extends UnaryLocalNode { + + private[this] var count = 0 + + override def output: Seq[Attribute] = child.output + + override def open(): Unit = child.open() + + override def close(): Unit = child.close() + + override def fetch(): InternalRow = child.fetch() + + override def next(): Boolean = { + if (count < limit) { + count += 1 + child.next() + } else { + false + } + } + +} diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/local/LocalNode.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/local/LocalNode.scala index 341c81438e6d6..1c4469acbf264 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/local/LocalNode.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/local/LocalNode.scala @@ -48,10 +48,10 @@ abstract class LocalNode extends TreeNode[LocalNode] { /** * Returns the current tuple. */ - def get(): InternalRow + def fetch(): InternalRow /** - * Closes the iterator and releases all resources. + * Closes the iterator and releases all resources. It should be idempotent. * * Implementations of this must also call the `close()` function of its children. */ @@ -64,10 +64,13 @@ abstract class LocalNode extends TreeNode[LocalNode] { val converter = CatalystTypeConverters.createToScalaConverter(StructType.fromAttributes(output)) val result = new scala.collection.mutable.ArrayBuffer[Row] open() - while (next()) { - result += converter.apply(get()).asInstanceOf[Row] + try { + while (next()) { + result += converter.apply(fetch()).asInstanceOf[Row] + } + } finally { + close() } - close() result } } diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/local/ProjectNode.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/local/ProjectNode.scala index e574d1473cdcb..9b8a4fe493026 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/local/ProjectNode.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/local/ProjectNode.scala @@ -34,8 +34,8 @@ case class ProjectNode(projectList: Seq[NamedExpression], child: LocalNode) exte override def next(): Boolean = child.next() - override def get(): InternalRow = { - project.apply(child.get()) + override def fetch(): InternalRow = { + project.apply(child.fetch()) } override def close(): Unit = child.close() diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/local/SeqScanNode.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/local/SeqScanNode.scala index 994de8afa9a02..242cb66e07b7f 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/local/SeqScanNode.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/local/SeqScanNode.scala @@ -41,7 +41,7 @@ case class SeqScanNode(output: Seq[Attribute], data: Seq[InternalRow]) extends L } } - override def get(): InternalRow = currentRow + override def fetch(): InternalRow = currentRow override def close(): Unit = { // Do nothing diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/local/UnionNode.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/local/UnionNode.scala new file mode 100644 index 0000000000000..ba4aa7671aebd --- /dev/null +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/local/UnionNode.scala @@ -0,0 +1,72 @@ +/* +* Licensed to the Apache Software Foundation (ASF) under one or more 
+* contributor license agreements. See the NOTICE file distributed with +* this work for additional information regarding copyright ownership. +* The ASF licenses this file to You under the Apache License, Version 2.0 +* (the "License"); you may not use this file except in compliance with +* the License. You may obtain a copy of the License at +* +* http://www.apache.org/licenses/LICENSE-2.0 +* +* Unless required by applicable law or agreed to in writing, software +* distributed under the License is distributed on an "AS IS" BASIS, +* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +* See the License for the specific language governing permissions and +* limitations under the License. +*/ + +package org.apache.spark.sql.execution.local + +import org.apache.spark.sql.catalyst.InternalRow +import org.apache.spark.sql.catalyst.expressions.Attribute + +case class UnionNode(children: Seq[LocalNode]) extends LocalNode { + + override def output: Seq[Attribute] = children.head.output + + private[this] var currentChild: LocalNode = _ + + private[this] var nextChildIndex: Int = _ + + override def open(): Unit = { + currentChild = children.head + currentChild.open() + nextChildIndex = 1 + } + + private def advanceToNextChild(): Boolean = { + var found = false + var exit = false + while (!exit && !found) { + if (currentChild != null) { + currentChild.close() + } + if (nextChildIndex >= children.size) { + found = false + exit = true + } else { + currentChild = children(nextChildIndex) + nextChildIndex += 1 + currentChild.open() + found = currentChild.next() + } + } + found + } + + override def close(): Unit = { + if (currentChild != null) { + currentChild.close() + } + } + + override def fetch(): InternalRow = currentChild.fetch() + + override def next(): Boolean = { + if (currentChild.next()) { + true + } else { + advanceToNextChild() + } + } +} diff --git a/sql/core/src/test/scala/org/apache/spark/sql/execution/SparkPlanTest.scala b/sql/core/src/test/scala/org/apache/spark/sql/execution/SparkPlanTest.scala index 3a87f374d94b0..5ab8f44faebf6 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/execution/SparkPlanTest.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/execution/SparkPlanTest.scala @@ -24,7 +24,7 @@ import scala.util.control.NonFatal import org.apache.spark.SparkFunSuite import org.apache.spark.sql.{DataFrame, DataFrameHolder, Row, SQLContext} import org.apache.spark.sql.catalyst.analysis.UnresolvedAttribute -import org.apache.spark.sql.catalyst.util._ +import org.apache.spark.sql.test.SQLTestUtils /** * Base class for writing tests for individual physical operators. For an example of how this @@ -184,7 +184,7 @@ object SparkPlanTest { return Some(errorMessage) } - compareAnswers(actualAnswer, expectedAnswer, sortAnswers).map { errorMessage => + SQLTestUtils.compareAnswers(actualAnswer, expectedAnswer, sortAnswers).map { errorMessage => s""" | Results do not match. 
| Actual result Spark plan: @@ -229,7 +229,7 @@ object SparkPlanTest { return Some(errorMessage) } - compareAnswers(sparkAnswer, expectedAnswer, sortAnswers).map { errorMessage => + SQLTestUtils.compareAnswers(sparkAnswer, expectedAnswer, sortAnswers).map { errorMessage => s""" | Results do not match for Spark plan: | $outputPlan @@ -238,46 +238,6 @@ object SparkPlanTest { } } - private def compareAnswers( - sparkAnswer: Seq[Row], - expectedAnswer: Seq[Row], - sort: Boolean): Option[String] = { - def prepareAnswer(answer: Seq[Row]): Seq[Row] = { - // Converts data to types that we can do equality comparison using Scala collections. - // For BigDecimal type, the Scala type has a better definition of equality test (similar to - // Java's java.math.BigDecimal.compareTo). - // For binary arrays, we convert it to Seq to avoid of calling java.util.Arrays.equals for - // equality test. - // This function is copied from Catalyst's QueryTest - val converted: Seq[Row] = answer.map { s => - Row.fromSeq(s.toSeq.map { - case d: java.math.BigDecimal => BigDecimal(d) - case b: Array[Byte] => b.toSeq - case o => o - }) - } - if (sort) { - converted.sortBy(_.toString()) - } else { - converted - } - } - if (prepareAnswer(expectedAnswer) != prepareAnswer(sparkAnswer)) { - val errorMessage = - s""" - | == Results == - | ${sideBySide( - s"== Expected Answer - ${expectedAnswer.size} ==" +: - prepareAnswer(expectedAnswer).map(_.toString()), - s"== Actual Answer - ${sparkAnswer.size} ==" +: - prepareAnswer(sparkAnswer).map(_.toString())).mkString("\n")} - """.stripMargin - Some(errorMessage) - } else { - None - } - } - private def executePlan(outputPlan: SparkPlan, _sqlContext: SQLContext): Seq[Row] = { // A very simple resolver to make writing tests easier. In contrast to the real resolver // this is always case sensitive and does not try to handle scoping or complex type resolution. diff --git a/sql/core/src/test/scala/org/apache/spark/sql/execution/local/FilterNodeSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/execution/local/FilterNodeSuite.scala new file mode 100644 index 0000000000000..07209f3779248 --- /dev/null +++ b/sql/core/src/test/scala/org/apache/spark/sql/execution/local/FilterNodeSuite.scala @@ -0,0 +1,41 @@ +/* +* Licensed to the Apache Software Foundation (ASF) under one or more +* contributor license agreements. See the NOTICE file distributed with +* this work for additional information regarding copyright ownership. +* The ASF licenses this file to You under the Apache License, Version 2.0 +* (the "License"); you may not use this file except in compliance with +* the License. You may obtain a copy of the License at +* +* http://www.apache.org/licenses/LICENSE-2.0 +* +* Unless required by applicable law or agreed to in writing, software +* distributed under the License is distributed on an "AS IS" BASIS, +* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +* See the License for the specific language governing permissions and +* limitations under the License. 
+*/ + +package org.apache.spark.sql.execution.local + +import org.apache.spark.sql.test.SharedSQLContext + +class FilterNodeSuite extends LocalNodeTest with SharedSQLContext { + + test("basic") { + val condition = (testData.col("key") % 2) === 0 + checkAnswer( + testData, + node => FilterNode(condition.expr, node), + testData.filter(condition).collect() + ) + } + + test("empty") { + val condition = (emptyTestData.col("key") % 2) === 0 + checkAnswer( + emptyTestData, + node => FilterNode(condition.expr, node), + emptyTestData.filter(condition).collect() + ) + } +} diff --git a/sql/core/src/test/scala/org/apache/spark/sql/execution/local/LimitNodeSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/execution/local/LimitNodeSuite.scala new file mode 100644 index 0000000000000..523c02f4a6014 --- /dev/null +++ b/sql/core/src/test/scala/org/apache/spark/sql/execution/local/LimitNodeSuite.scala @@ -0,0 +1,39 @@ +/* +* Licensed to the Apache Software Foundation (ASF) under one or more +* contributor license agreements. See the NOTICE file distributed with +* this work for additional information regarding copyright ownership. +* The ASF licenses this file to You under the Apache License, Version 2.0 +* (the "License"); you may not use this file except in compliance with +* the License. You may obtain a copy of the License at +* +* http://www.apache.org/licenses/LICENSE-2.0 +* +* Unless required by applicable law or agreed to in writing, software +* distributed under the License is distributed on an "AS IS" BASIS, +* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +* See the License for the specific language governing permissions and +* limitations under the License. +*/ + +package org.apache.spark.sql.execution.local + +import org.apache.spark.sql.test.SharedSQLContext + +class LimitNodeSuite extends LocalNodeTest with SharedSQLContext { + + test("basic") { + checkAnswer( + testData, + node => LimitNode(10, node), + testData.limit(10).collect() + ) + } + + test("empty") { + checkAnswer( + emptyTestData, + node => LimitNode(10, node), + emptyTestData.limit(10).collect() + ) + } +} diff --git a/sql/core/src/test/scala/org/apache/spark/sql/execution/local/LocalNodeTest.scala b/sql/core/src/test/scala/org/apache/spark/sql/execution/local/LocalNodeTest.scala new file mode 100644 index 0000000000000..95f06081bd0a8 --- /dev/null +++ b/sql/core/src/test/scala/org/apache/spark/sql/execution/local/LocalNodeTest.scala @@ -0,0 +1,146 @@ +/* +* Licensed to the Apache Software Foundation (ASF) under one or more +* contributor license agreements. See the NOTICE file distributed with +* this work for additional information regarding copyright ownership. +* The ASF licenses this file to You under the Apache License, Version 2.0 +* (the "License"); you may not use this file except in compliance with +* the License. You may obtain a copy of the License at +* +* http://www.apache.org/licenses/LICENSE-2.0 +* +* Unless required by applicable law or agreed to in writing, software +* distributed under the License is distributed on an "AS IS" BASIS, +* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +* See the License for the specific language governing permissions and +* limitations under the License. 
+*/ + +package org.apache.spark.sql.execution.local + +import scala.util.control.NonFatal + +import org.apache.spark.SparkFunSuite +import org.apache.spark.sql.{DataFrame, Row} +import org.apache.spark.sql.test.SQLTestUtils + +class LocalNodeTest extends SparkFunSuite { + + /** + * Runs the LocalNode and makes sure the answer matches the expected result. + * @param input the input data to be used. + * @param nodeFunction a function which accepts the input LocalNode and uses it to instantiate + * the local physical operator that's being tested. + * @param expectedAnswer the expected result in a [[Seq]] of [[Row]]s. + * @param sortAnswers if true, the answers will be sorted by their toString representations prior + * to being compared. + */ + protected def checkAnswer( + input: DataFrame, + nodeFunction: LocalNode => LocalNode, + expectedAnswer: Seq[Row], + sortAnswers: Boolean = true): Unit = { + doCheckAnswer( + input :: Nil, + nodes => nodeFunction(nodes.head), + expectedAnswer, + sortAnswers) + } + + /** + * Runs the LocalNode and makes sure the answer matches the expected result. + * @param left the left input data to be used. + * @param right the right input data to be used. + * @param nodeFunction a function which accepts the input LocalNode and uses it to instantiate + * the local physical operator that's being tested. + * @param expectedAnswer the expected result in a [[Seq]] of [[Row]]s. + * @param sortAnswers if true, the answers will be sorted by their toString representations prior + * to being compared. + */ + protected def checkAnswer2( + left: DataFrame, + right: DataFrame, + nodeFunction: (LocalNode, LocalNode) => LocalNode, + expectedAnswer: Seq[Row], + sortAnswers: Boolean = true): Unit = { + doCheckAnswer( + left :: right :: Nil, + nodes => nodeFunction(nodes(0), nodes(1)), + expectedAnswer, + sortAnswers) + } + + /** + * Runs the `LocalNode`s and makes sure the answer matches the expected result. + * @param input the input data to be used. + * @param nodeFunction a function which accepts a sequence of input `LocalNode`s and uses them to + * instantiate the local physical operator that's being tested. + * @param expectedAnswer the expected result in a [[Seq]] of [[Row]]s. + * @param sortAnswers if true, the answers will be sorted by their toString representations prior + * to being compared. + */ + protected def doCheckAnswer( + input: Seq[DataFrame], + nodeFunction: Seq[LocalNode] => LocalNode, + expectedAnswer: Seq[Row], + sortAnswers: Boolean = true): Unit = { + LocalNodeTest.checkAnswer( + input.map(dataFrameToSeqScanNode), nodeFunction, expectedAnswer, sortAnswers) match { + case Some(errorMessage) => fail(errorMessage) + case None => + } + } + + protected def dataFrameToSeqScanNode(df: DataFrame): SeqScanNode = { + new SeqScanNode( + df.queryExecution.sparkPlan.output, + df.queryExecution.toRdd.map(_.copy()).collect()) + } + +} + +/** + * Helper methods for writing tests of individual local physical operators. + */ +object LocalNodeTest { + + /** + * Runs the `LocalNode`s and makes sure the answer matches the expected result. + * @param input the input data to be used. + * @param nodeFunction a function which accepts the input `LocalNode`s and uses them to + * instantiate the local physical operator that's being tested. + * @param expectedAnswer the expected result in a [[Seq]] of [[Row]]s. + * @param sortAnswers if true, the answers will be sorted by their toString representations prior + * to being compared. 
+ */ + def checkAnswer( + input: Seq[SeqScanNode], + nodeFunction: Seq[LocalNode] => LocalNode, + expectedAnswer: Seq[Row], + sortAnswers: Boolean): Option[String] = { + + val outputNode = nodeFunction(input) + + val outputResult: Seq[Row] = try { + outputNode.collect() + } catch { + case NonFatal(e) => + val errorMessage = + s""" + | Exception thrown while executing local plan: + | $outputNode + | == Exception == + | $e + | ${org.apache.spark.sql.catalyst.util.stackTraceToString(e)} + """.stripMargin + return Some(errorMessage) + } + + SQLTestUtils.compareAnswers(outputResult, expectedAnswer, sortAnswers).map { errorMessage => + s""" + | Results do not match for local plan: + | $outputNode + | $errorMessage + """.stripMargin + } + } +} diff --git a/sql/core/src/test/scala/org/apache/spark/sql/execution/local/ProjectNodeSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/execution/local/ProjectNodeSuite.scala new file mode 100644 index 0000000000000..ffcf092e2c66a --- /dev/null +++ b/sql/core/src/test/scala/org/apache/spark/sql/execution/local/ProjectNodeSuite.scala @@ -0,0 +1,44 @@ +/* +* Licensed to the Apache Software Foundation (ASF) under one or more +* contributor license agreements. See the NOTICE file distributed with +* this work for additional information regarding copyright ownership. +* The ASF licenses this file to You under the Apache License, Version 2.0 +* (the "License"); you may not use this file except in compliance with +* the License. You may obtain a copy of the License at +* +* http://www.apache.org/licenses/LICENSE-2.0 +* +* Unless required by applicable law or agreed to in writing, software +* distributed under the License is distributed on an "AS IS" BASIS, +* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +* See the License for the specific language governing permissions and +* limitations under the License. +*/ + +package org.apache.spark.sql.execution.local + +import org.apache.spark.sql.test.SharedSQLContext + +class ProjectNodeSuite extends LocalNodeTest with SharedSQLContext { + + test("basic") { + val output = testData.queryExecution.sparkPlan.output + val columns = Seq(output(1), output(0)) + checkAnswer( + testData, + node => ProjectNode(columns, node), + testData.select("value", "key").collect() + ) + } + + test("empty") { + val output = emptyTestData.queryExecution.sparkPlan.output + val columns = Seq(output(1), output(0)) + checkAnswer( + emptyTestData, + node => ProjectNode(columns, node), + emptyTestData.select("value", "key").collect() + ) + } + +} diff --git a/sql/core/src/test/scala/org/apache/spark/sql/execution/local/UnionNodeSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/execution/local/UnionNodeSuite.scala new file mode 100644 index 0000000000000..34670287c3e1d --- /dev/null +++ b/sql/core/src/test/scala/org/apache/spark/sql/execution/local/UnionNodeSuite.scala @@ -0,0 +1,52 @@ +/* +* Licensed to the Apache Software Foundation (ASF) under one or more +* contributor license agreements. See the NOTICE file distributed with +* this work for additional information regarding copyright ownership. +* The ASF licenses this file to You under the Apache License, Version 2.0 +* (the "License"); you may not use this file except in compliance with +* the License. 
You may obtain a copy of the License at +* +* http://www.apache.org/licenses/LICENSE-2.0 +* +* Unless required by applicable law or agreed to in writing, software +* distributed under the License is distributed on an "AS IS" BASIS, +* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +* See the License for the specific language governing permissions and +* limitations under the License. +*/ + +package org.apache.spark.sql.execution.local + +import org.apache.spark.sql.test.SharedSQLContext + +class UnionNodeSuite extends LocalNodeTest with SharedSQLContext { + + test("basic") { + checkAnswer2( + testData, + testData, + (node1, node2) => UnionNode(Seq(node1, node2)), + testData.unionAll(testData).collect() + ) + } + + test("empty") { + checkAnswer2( + emptyTestData, + emptyTestData, + (node1, node2) => UnionNode(Seq(node1, node2)), + emptyTestData.unionAll(emptyTestData).collect() + ) + } + + test("complicated union") { + val dfs = Seq(testData, emptyTestData, emptyTestData, testData, testData, emptyTestData, + emptyTestData, emptyTestData, testData, emptyTestData) + doCheckAnswer( + dfs, + nodes => UnionNode(nodes), + dfs.reduce(_.unionAll(_)).collect() + ) + } + +} diff --git a/sql/core/src/test/scala/org/apache/spark/sql/test/SQLTestData.scala b/sql/core/src/test/scala/org/apache/spark/sql/test/SQLTestData.scala index 1374a97476ca1..3fc02df954e23 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/test/SQLTestData.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/test/SQLTestData.scala @@ -36,6 +36,13 @@ private[sql] trait SQLTestData { self => // Note: all test data should be lazy because the SQLContext is not set up yet. + protected lazy val emptyTestData: DataFrame = { + val df = _sqlContext.sparkContext.parallelize( + Seq.empty[Int].map(i => TestData(i, i.toString))).toDF() + df.registerTempTable("emptyTestData") + df + } + protected lazy val testData: DataFrame = { val df = _sqlContext.sparkContext.parallelize( (1 to 100).map(i => TestData(i, i.toString))).toDF() @@ -240,6 +247,7 @@ private[sql] trait SQLTestData { self => */ def loadTestData(): Unit = { assert(_sqlContext != null, "attempted to initialize test data before SQLContext.") + emptyTestData testData testData2 testData3 diff --git a/sql/core/src/test/scala/org/apache/spark/sql/test/SQLTestUtils.scala b/sql/core/src/test/scala/org/apache/spark/sql/test/SQLTestUtils.scala index cdd691e035897..dc08306ad9cb4 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/test/SQLTestUtils.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/test/SQLTestUtils.scala @@ -27,8 +27,9 @@ import org.apache.hadoop.conf.Configuration import org.scalatest.BeforeAndAfterAll import org.apache.spark.SparkFunSuite -import org.apache.spark.sql.{DataFrame, SQLContext, SQLImplicits} +import org.apache.spark.sql.{DataFrame, Row, SQLContext, SQLImplicits} import org.apache.spark.sql.catalyst.plans.logical.LogicalPlan +import org.apache.spark.sql.catalyst.util._ import org.apache.spark.util.Utils /** @@ -179,3 +180,46 @@ private[sql] trait SQLTestUtils DataFrame(_sqlContext, plan) } } + +private[sql] object SQLTestUtils { + + def compareAnswers( + sparkAnswer: Seq[Row], + expectedAnswer: Seq[Row], + sort: Boolean): Option[String] = { + def prepareAnswer(answer: Seq[Row]): Seq[Row] = { + // Converts data to types that we can do equality comparison using Scala collections. + // For BigDecimal type, the Scala type has a better definition of equality test (similar to + // Java's java.math.BigDecimal.compareTo). 
+ // For binary arrays, we convert it to Seq to avoid of calling java.util.Arrays.equals for + // equality test. + // This function is copied from Catalyst's QueryTest + val converted: Seq[Row] = answer.map { s => + Row.fromSeq(s.toSeq.map { + case d: java.math.BigDecimal => BigDecimal(d) + case b: Array[Byte] => b.toSeq + case o => o + }) + } + if (sort) { + converted.sortBy(_.toString()) + } else { + converted + } + } + if (prepareAnswer(expectedAnswer) != prepareAnswer(sparkAnswer)) { + val errorMessage = + s""" + | == Results == + | ${sideBySide( + s"== Expected Answer - ${expectedAnswer.size} ==" +: + prepareAnswer(expectedAnswer).map(_.toString()), + s"== Actual Answer - ${sparkAnswer.size} ==" +: + prepareAnswer(sparkAnswer).map(_.toString())).mkString("\n")} + """.stripMargin + Some(errorMessage) + } else { + None + } + } +} From 905fbe498bdd29116468628e6a2a553c1fd57165 Mon Sep 17 00:00:00 2001 From: Xiangrui Meng Date: Sat, 29 Aug 2015 23:26:23 -0700 Subject: [PATCH 144/802] [SPARK-10348] [MLLIB] updates ml-guide * replace `ML Dataset` by `DataFrame` to unify the abstraction * ML algorithms -> pipeline components to describe the main concept * remove Scala API doc links from the main guide * `Section Title` -> `Section tile` to be consistent with other section titles in MLlib guide * modified lines break at 100 chars or periods jkbradley feynmanliang Author: Xiangrui Meng Closes #8517 from mengxr/SPARK-10348. --- docs/ml-guide.md | 118 +++++++++++++++++++++++++++----------------- docs/mllib-guide.md | 12 ++--- 2 files changed, 78 insertions(+), 52 deletions(-) diff --git a/docs/ml-guide.md b/docs/ml-guide.md index a92a285f3af85..4ba07542bfb40 100644 --- a/docs/ml-guide.md +++ b/docs/ml-guide.md @@ -24,61 +24,74 @@ title: Spark ML Programming Guide The `spark.ml` package aims to provide a uniform set of high-level APIs built on top of [DataFrames](sql-programming-guide.html#dataframes) that help users create and tune practical machine learning pipelines. -See the [Algorithm Guides section](#algorithm-guides) below for guides on sub-packages of +See the [algorithm guides](#algorithm-guides) section below for guides on sub-packages of `spark.ml`, including feature transformers unique to the Pipelines API, ensembles, and more. -**Table of Contents** +**Table of contents** * This will become a table of contents (this text will be scraped). {:toc} -# Main Concepts +# Main concepts -Spark ML standardizes APIs for machine learning algorithms to make it easier to combine multiple algorithms into a single pipeline, or workflow. This section covers the key concepts introduced by the Spark ML API. +Spark ML standardizes APIs for machine learning algorithms to make it easier to combine multiple +algorithms into a single pipeline, or workflow. +This section covers the key concepts introduced by the Spark ML API, where the pipeline concept is +mostly inspired by the [scikit-learn](http://scikit-learn.org/) project. -* **[ML Dataset](ml-guide.html#ml-dataset)**: Spark ML uses the [`DataFrame`](api/scala/index.html#org.apache.spark.sql.DataFrame) from Spark SQL as a dataset which can hold a variety of data types. -E.g., a dataset could have different columns storing text, feature vectors, true labels, and predictions. +* **[`DataFrame`](ml-guide.html#dataframe)**: Spark ML uses `DataFrame` from Spark SQL as an ML + dataset, which can hold a variety of data types. + E.g., a `DataFrame` could have different columns storing text, feature vectors, true labels, and predictions. 
* **[`Transformer`](ml-guide.html#transformers)**: A `Transformer` is an algorithm which can transform one `DataFrame` into another `DataFrame`. -E.g., an ML model is a `Transformer` which transforms an RDD with features into an RDD with predictions. +E.g., an ML model is a `Transformer` which transforms `DataFrame` with features into a `DataFrame` with predictions. * **[`Estimator`](ml-guide.html#estimators)**: An `Estimator` is an algorithm which can be fit on a `DataFrame` to produce a `Transformer`. -E.g., a learning algorithm is an `Estimator` which trains on a dataset and produces a model. +E.g., a learning algorithm is an `Estimator` which trains on a `DataFrame` and produces a model. * **[`Pipeline`](ml-guide.html#pipeline)**: A `Pipeline` chains multiple `Transformer`s and `Estimator`s together to specify an ML workflow. -* **[`Param`](ml-guide.html#parameters)**: All `Transformer`s and `Estimator`s now share a common API for specifying parameters. +* **[`Parameter`](ml-guide.html#parameters)**: All `Transformer`s and `Estimator`s now share a common API for specifying parameters. -## ML Dataset +## DataFrame Machine learning can be applied to a wide variety of data types, such as vectors, text, images, and structured data. -Spark ML adopts the [`DataFrame`](api/scala/index.html#org.apache.spark.sql.DataFrame) from Spark SQL in order to support a variety of data types under a unified Dataset concept. +Spark ML adopts the `DataFrame` from Spark SQL in order to support a variety of data types. `DataFrame` supports many basic and structured types; see the [Spark SQL datatype reference](sql-programming-guide.html#spark-sql-datatype-reference) for a list of supported types. -In addition to the types listed in the Spark SQL guide, `DataFrame` can use ML [`Vector`](api/scala/index.html#org.apache.spark.mllib.linalg.Vector) types. +In addition to the types listed in the Spark SQL guide, `DataFrame` can use ML [`Vector`](mllib-data-types.html#local-vector) types. A `DataFrame` can be created either implicitly or explicitly from a regular `RDD`. See the code examples below and the [Spark SQL programming guide](sql-programming-guide.html) for examples. Columns in a `DataFrame` are named. The code examples below use names such as "text," "features," and "label." -## ML Algorithms +## Pipeline components ### Transformers -A [`Transformer`](api/scala/index.html#org.apache.spark.ml.Transformer) is an abstraction which includes feature transformers and learned models. Technically, a `Transformer` implements a method `transform()` which converts one `DataFrame` into another, generally by appending one or more columns. +A `Transformer` is an abstraction that includes feature transformers and learned models. +Technically, a `Transformer` implements a method `transform()`, which converts one `DataFrame` into +another, generally by appending one or more columns. For example: -* A feature transformer might take a dataset, read a column (e.g., text), convert it into a new column (e.g., feature vectors), append the new column to the dataset, and output the updated dataset. -* A learning model might take a dataset, read the column containing feature vectors, predict the label for each feature vector, append the labels as a new column, and output the updated dataset. +* A feature transformer might take a `DataFrame`, read a column (e.g., text), map it into a new + column (e.g., feature vectors), and output a new `DataFrame` with the mapped column appended. 
+* A learning model might take a `DataFrame`, read the column containing feature vectors, predict the + label for each feature vector, and output a new `DataFrame` with predicted labels appended as a + column. ### Estimators -An [`Estimator`](api/scala/index.html#org.apache.spark.ml.Estimator) abstracts the concept of a learning algorithm or any algorithm which fits or trains on data. Technically, an `Estimator` implements a method `fit()` which accepts a `DataFrame` and produces a `Transformer`. -For example, a learning algorithm such as `LogisticRegression` is an `Estimator`, and calling `fit()` trains a `LogisticRegressionModel`, which is a `Transformer`. +An `Estimator` abstracts the concept of a learning algorithm or any algorithm that fits or trains on +data. +Technically, an `Estimator` implements a method `fit()`, which accepts a `DataFrame` and produces a +`Model`, which is a `Transformer`. +For example, a learning algorithm such as `LogisticRegression` is an `Estimator`, and calling +`fit()` trains a `LogisticRegressionModel`, which is a `Model` and hence a `Transformer`. -### Properties of ML Algorithms +### Properties of pipeline components -`Transformer`s and `Estimator`s are both stateless. In the future, stateful algorithms may be supported via alternative concepts. +`Transformer.transform()`s and `Estimator.fit()`s are both stateless. In the future, stateful algorithms may be supported via alternative concepts. Each instance of a `Transformer` or `Estimator` has a unique ID, which is useful in specifying parameters (discussed below). @@ -91,15 +104,16 @@ E.g., a simple text document processing workflow might include several stages: * Convert each document's words into a numerical feature vector. * Learn a prediction model using the feature vectors and labels. -Spark ML represents such a workflow as a [`Pipeline`](api/scala/index.html#org.apache.spark.ml.Pipeline), -which consists of a sequence of [`PipelineStage`s](api/scala/index.html#org.apache.spark.ml.PipelineStage) (`Transformer`s and `Estimator`s) to be run in a specific order. We will use this simple workflow as a running example in this section. +Spark ML represents such a workflow as a `Pipeline`, which consists of a sequence of +`PipelineStage`s (`Transformer`s and `Estimator`s) to be run in a specific order. +We will use this simple workflow as a running example in this section. -### How It Works +### How it works A `Pipeline` is specified as a sequence of stages, and each stage is either a `Transformer` or an `Estimator`. -These stages are run in order, and the input dataset is modified as it passes through each stage. -For `Transformer` stages, the `transform()` method is called on the dataset. -For `Estimator` stages, the `fit()` method is called to produce a `Transformer` (which becomes part of the `PipelineModel`, or fitted `Pipeline`), and that `Transformer`'s `transform()` method is called on the dataset. +These stages are run in order, and the input `DataFrame` is transformed as it passes through each stage. +For `Transformer` stages, the `transform()` method is called on the `DataFrame`. +For `Estimator` stages, the `fit()` method is called to produce a `Transformer` (which becomes part of the `PipelineModel`, or fitted `Pipeline`), and that `Transformer`'s `transform()` method is called on the `DataFrame`. We illustrate this for the simple text document workflow. The figure below is for the *training time* usage of a `Pipeline`. 
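To make the workflow concrete, here is a hedged Scala sketch of a three-stage pipeline being fit and then applied, written in the style of a `spark-shell` session. It assumes a `sqlContext` is in scope; the toy data and column names are illustrative only.

```scala
import org.apache.spark.ml.Pipeline
import org.apache.spark.ml.classification.LogisticRegression
import org.apache.spark.ml.feature.{HashingTF, Tokenizer}

// Toy training data: id, text, label.
val training = sqlContext.createDataFrame(Seq(
  (0L, "a b c d e spark", 1.0),
  (1L, "b d", 0.0),
  (2L, "spark f g h", 1.0),
  (3L, "hadoop mapreduce", 0.0)
)).toDF("id", "text", "label")

// Two Transformers followed by an Estimator, chained into a Pipeline (itself an Estimator).
val tokenizer = new Tokenizer().setInputCol("text").setOutputCol("words")
val hashingTF = new HashingTF().setInputCol(tokenizer.getOutputCol).setOutputCol("features")
val lr = new LogisticRegression().setMaxIter(10).setRegParam(0.01)
val pipeline = new Pipeline().setStages(Array(tokenizer, hashingTF, lr))

// fit() runs the stages in order and returns a PipelineModel (a Transformer).
val model = pipeline.fit(training)

// At test time the fitted PipelineModel replays the identical feature processing.
val test = sqlContext.createDataFrame(Seq(
  (4L, "spark hadoop"),
  (5L, "a b c")
)).toDF("id", "text")
model.transform(test).select("id", "text", "prediction").show()
```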
@@ -115,14 +129,17 @@ We illustrate this for the simple text document workflow. The figure below is f Above, the top row represents a `Pipeline` with three stages. The first two (`Tokenizer` and `HashingTF`) are `Transformer`s (blue), and the third (`LogisticRegression`) is an `Estimator` (red). The bottom row represents data flowing through the pipeline, where cylinders indicate `DataFrame`s. -The `Pipeline.fit()` method is called on the original dataset which has raw text documents and labels. -The `Tokenizer.transform()` method splits the raw text documents into words, adding a new column with words into the dataset. -The `HashingTF.transform()` method converts the words column into feature vectors, adding a new column with those vectors to the dataset. +The `Pipeline.fit()` method is called on the original `DataFrame`, which has raw text documents and labels. +The `Tokenizer.transform()` method splits the raw text documents into words, adding a new column with words to the `DataFrame`. +The `HashingTF.transform()` method converts the words column into feature vectors, adding a new column with those vectors to the `DataFrame`. Now, since `LogisticRegression` is an `Estimator`, the `Pipeline` first calls `LogisticRegression.fit()` to produce a `LogisticRegressionModel`. -If the `Pipeline` had more stages, it would call the `LogisticRegressionModel`'s `transform()` method on the dataset before passing the dataset to the next stage. +If the `Pipeline` had more stages, it would call the `LogisticRegressionModel`'s `transform()` +method on the `DataFrame` before passing the `DataFrame` to the next stage. A `Pipeline` is an `Estimator`. -Thus, after a `Pipeline`'s `fit()` method runs, it produces a `PipelineModel` which is a `Transformer`. This `PipelineModel` is used at *test time*; the figure below illustrates this usage. +Thus, after a `Pipeline`'s `fit()` method runs, it produces a `PipelineModel`, which is a +`Transformer`. +This `PipelineModel` is used at *test time*; the figure below illustrates this usage.

In the figure above, the `PipelineModel` has the same number of stages as the original `Pipeline`, but all `Estimator`s in the original `Pipeline` have become `Transformer`s. -When the `PipelineModel`'s `transform()` method is called on a test dataset, the data are passed through the `Pipeline` in order. +When the `PipelineModel`'s `transform()` method is called on a test dataset, the data are passed +through the fitted pipeline in order. Each stage's `transform()` method updates the dataset and passes it to the next stage. `Pipeline`s and `PipelineModel`s help to ensure that training and test data go through identical feature processing steps. @@ -143,40 +161,48 @@ Each stage's `transform()` method updates the dataset and passes it to the next *DAG `Pipeline`s*: A `Pipeline`'s stages are specified as an ordered array. The examples given here are all for linear `Pipeline`s, i.e., `Pipeline`s in which each stage uses data produced by the previous stage. It is possible to create non-linear `Pipeline`s as long as the data flow graph forms a Directed Acyclic Graph (DAG). This graph is currently specified implicitly based on the input and output column names of each stage (generally specified as parameters). If the `Pipeline` forms a DAG, then the stages must be specified in topological order. -*Runtime checking*: Since `Pipeline`s can operate on datasets with varied types, they cannot use compile-time type checking. `Pipeline`s and `PipelineModel`s instead do runtime checking before actually running the `Pipeline`. This type checking is done using the dataset *schema*, a description of the data types of columns in the `DataFrame`. +*Runtime checking*: Since `Pipeline`s can operate on `DataFrame`s with varied types, they cannot use +compile-time type checking. +`Pipeline`s and `PipelineModel`s instead do runtime checking before actually running the `Pipeline`. +This type checking is done using the `DataFrame` *schema*, a description of the data types of columns in the `DataFrame`. ## Parameters Spark ML `Estimator`s and `Transformer`s use a uniform API for specifying parameters. -A [`Param`](api/scala/index.html#org.apache.spark.ml.param.Param) is a named parameter with self-contained documentation. -A [`ParamMap`](api/scala/index.html#org.apache.spark.ml.param.ParamMap) is a set of (parameter, value) pairs. +A `Param` is a named parameter with self-contained documentation. +A `ParamMap` is a set of (parameter, value) pairs. There are two main ways to pass parameters to an algorithm: -1. Set parameters for an instance. E.g., if `lr` is an instance of `LogisticRegression`, one could call `lr.setMaxIter(10)` to make `lr.fit()` use at most 10 iterations. This API resembles the API used in MLlib. +1. Set parameters for an instance. E.g., if `lr` is an instance of `LogisticRegression`, one could + call `lr.setMaxIter(10)` to make `lr.fit()` use at most 10 iterations. + This API resembles the API used in `spark.mllib` package. 2. Pass a `ParamMap` to `fit()` or `transform()`. Any parameters in the `ParamMap` will override parameters previously specified via setter methods. Parameters belong to specific instances of `Estimator`s and `Transformer`s. For example, if we have two `LogisticRegression` instances `lr1` and `lr2`, then we can build a `ParamMap` with both `maxIter` parameters specified: `ParamMap(lr1.maxIter -> 10, lr2.maxIter -> 20)`. This is useful if there are two algorithms with the `maxIter` parameter in a `Pipeline`. 
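A minimal sketch of both ways of passing parameters, and of instance-scoped `Param`s, again assuming `sqlContext` is available:

{% highlight scala %}
import org.apache.spark.ml.classification.LogisticRegression
import org.apache.spark.ml.param.ParamMap
import org.apache.spark.mllib.linalg.Vectors

val training = sqlContext.createDataFrame(Seq(
  (1.0, Vectors.dense(0.0, 1.1, 0.1)),
  (0.0, Vectors.dense(2.0, 1.0, -1.0))
)).toDF("label", "features")

val lr1 = new LogisticRegression()
val lr2 = new LogisticRegression()
lr1.setMaxIter(5)  // 1. setter method on the instance

// 2. a ParamMap passed to fit(). Params are scoped to the instance that owns them,
// so lr1 resolves maxIter to 10 and lr2 to 20, and ParamMap values override setters.
val paramMap = ParamMap(lr1.maxIter -> 10, lr2.maxIter -> 20)
val model1 = lr1.fit(training, paramMap)  // trained with maxIter = 10, not 5
val model2 = lr2.fit(training, paramMap)  // trained with maxIter = 20
{% endhighlight %}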
-# Algorithm Guides +# Algorithm guides There are now several algorithms in the Pipelines API which are not in the `spark.mllib` API, so we link to documentation for them here. These algorithms are mostly feature transformers, which fit naturally into the `Transformer` abstraction in Pipelines, and ensembles, which fit naturally into the `Estimator` abstraction in the Pipelines. -**Pipelines API Algorithm Guides** - -* [Feature Extraction, Transformation, and Selection](ml-features.html) -* [Decision Trees for Classification and Regression](ml-decision-tree.html) +* [Feature extraction, transformation, and selection](ml-features.html) +* [Decision Trees for classification and regression](ml-decision-tree.html) * [Ensembles](ml-ensembles.html) * [Linear methods with elastic net regularization](ml-linear-methods.html) * [Multilayer perceptron classifier](ml-ann.html) -# Code Examples +# Code examples This section gives code examples illustrating the functionality discussed above. -There is not yet documentation for specific algorithms in Spark ML. For more info, please refer to the [API Documentation](api/scala/index.html#org.apache.spark.ml.package). Spark ML algorithms are currently wrappers for MLlib algorithms, and the [MLlib programming guide](mllib-guide.html) has details on specific algorithms. +For more info, please refer to the API documentation +([Scala](api/scala/index.html#org.apache.spark.ml.package), +[Java](api/java/org/apache/spark/ml/package-summary.html), +and [Python](api/python/pyspark.ml.html)). +Some Spark ML algorithms are wrappers for `spark.mllib` algorithms, and the +[MLlib programming guide](mllib-guide.html) has details on specific algorithms. ## Example: Estimator, Transformer, and Param @@ -627,7 +653,7 @@ sc.stop() -## Example: Model Selection via Cross-Validation +## Example: model selection via cross-validation An important task in ML is *model selection*, or using data to find the best model or parameters for a given task. This is also called *tuning*. `Pipeline`s facilitate model selection by making it easy to tune an entire `Pipeline` at once, rather than tuning each element in the `Pipeline` separately. @@ -873,11 +899,11 @@ jsc.stop(); -## Example: Model Selection via Train Validation Split +## Example: model selection via train validation split In addition to `CrossValidator` Spark also offers `TrainValidationSplit` for hyper-parameter tuning. `TrainValidationSplit` only evaluates each combination of parameters once as opposed to k times in case of `CrossValidator`. It is therefore less expensive, - but will not produce as reliable results when the training dataset is not sufficiently large.. + but will not produce as reliable results when the training dataset is not sufficiently large. `TrainValidationSplit` takes an `Estimator`, a set of `ParamMap`s provided in the `estimatorParamMaps` parameter, and an `Evaluator`. diff --git a/docs/mllib-guide.md b/docs/mllib-guide.md index 876dcfd40ed7b..257f7cc7603fa 100644 --- a/docs/mllib-guide.md +++ b/docs/mllib-guide.md @@ -14,9 +14,9 @@ primitives and higher-level pipeline APIs. It divides into two packages: * [`spark.mllib`](mllib-guide.html#mllib-types-algorithms-and-utilities) contains the original API - built on top of RDDs. + built on top of [RDDs](programming-guide.html#resilient-distributed-datasets-rdds). * [`spark.ml`](mllib-guide.html#sparkml-high-level-apis-for-ml-pipelines) provides higher-level API - built on top of DataFrames for constructing ML pipelines. 
+ built on top of [DataFrames](sql-programming-guide.html#dataframes) for constructing ML pipelines. Using `spark.ml` is recommended because with DataFrames the API is more versatile and flexible. But we will keep supporting `spark.mllib` along with the development of `spark.ml`. @@ -57,19 +57,19 @@ We list major functionality from both below, with links to detailed guides. * [FP-growth](mllib-frequent-pattern-mining.html#fp-growth) * [association rules](mllib-frequent-pattern-mining.html#association-rules) * [PrefixSpan](mllib-frequent-pattern-mining.html#prefix-span) -* [Evaluation Metrics](mllib-evaluation-metrics.html) +* [Evaluation metrics](mllib-evaluation-metrics.html) +* [PMML model export](mllib-pmml-model-export.html) * [Optimization (developer)](mllib-optimization.html) * [stochastic gradient descent](mllib-optimization.html#stochastic-gradient-descent-sgd) * [limited-memory BFGS (L-BFGS)](mllib-optimization.html#limited-memory-bfgs-l-bfgs) -* [PMML model export](mllib-pmml-model-export.html) # spark.ml: high-level APIs for ML pipelines **[spark.ml programming guide](ml-guide.html)** provides an overview of the Pipelines API and major concepts. It also contains sections on using algorithms within the Pipelines API, for example: -* [Feature Extraction, Transformation, and Selection](ml-features.html) -* [Decision Trees for Classification and Regression](ml-decision-tree.html) +* [Feature extraction, transformation, and selection](ml-features.html) +* [Decision trees for classification and regression](ml-decision-tree.html) * [Ensembles](ml-ensembles.html) * [Linear methods with elastic net regularization](ml-linear-methods.html) * [Multilayer perceptron classifier](ml-ann.html) From ca69fc8efda8a3e5442ffa16692a2b1eb86b7673 Mon Sep 17 00:00:00 2001 From: Xiangrui Meng Date: Sat, 29 Aug 2015 23:57:09 -0700 Subject: [PATCH 145/802] [SPARK-10331] [MLLIB] Update example code in ml-guide * The example code was added in 1.2, before `createDataFrame`. This PR switches to `createDataFrame`. Java code still uses JavaBean. * assume `sqlContext` is available * fix some minor issues from previous code review jkbradley srowen feynmanliang Author: Xiangrui Meng Closes #8518 from mengxr/SPARK-10331. --- docs/ml-guide.md | 362 +++++++++++++++++++---------------------------- 1 file changed, 147 insertions(+), 215 deletions(-) diff --git a/docs/ml-guide.md b/docs/ml-guide.md index 4ba07542bfb40..78c93a95c7807 100644 --- a/docs/ml-guide.md +++ b/docs/ml-guide.md @@ -212,26 +212,18 @@ This example covers the concepts of `Estimator`, `Transformer`, and `Param`.
{% highlight scala %} -import org.apache.spark.{SparkConf, SparkContext} import org.apache.spark.ml.classification.LogisticRegression import org.apache.spark.ml.param.ParamMap import org.apache.spark.mllib.linalg.{Vector, Vectors} -import org.apache.spark.mllib.regression.LabeledPoint -import org.apache.spark.sql.{Row, SQLContext} +import org.apache.spark.sql.Row -val conf = new SparkConf().setAppName("SimpleParamsExample") -val sc = new SparkContext(conf) -val sqlContext = new SQLContext(sc) -import sqlContext.implicits._ - -// Prepare training data. -// We use LabeledPoint, which is a case class. Spark SQL can convert RDDs of case classes -// into DataFrames, where it uses the case class metadata to infer the schema. -val training = sc.parallelize(Seq( - LabeledPoint(1.0, Vectors.dense(0.0, 1.1, 0.1)), - LabeledPoint(0.0, Vectors.dense(2.0, 1.0, -1.0)), - LabeledPoint(0.0, Vectors.dense(2.0, 1.3, 1.0)), - LabeledPoint(1.0, Vectors.dense(0.0, 1.2, -0.5)))) +// Prepare training data from a list of (label, features) tuples. +val training = sqlContext.createDataFrame(Seq( + (1.0, Vectors.dense(0.0, 1.1, 0.1)), + (0.0, Vectors.dense(2.0, 1.0, -1.0)), + (0.0, Vectors.dense(2.0, 1.3, 1.0)), + (1.0, Vectors.dense(0.0, 1.2, -0.5)) +)).toDF("label", "features") // Create a LogisticRegression instance. This instance is an Estimator. val lr = new LogisticRegression() @@ -243,7 +235,7 @@ lr.setMaxIter(10) .setRegParam(0.01) // Learn a LogisticRegression model. This uses the parameters stored in lr. -val model1 = lr.fit(training.toDF) +val model1 = lr.fit(training) // Since model1 is a Model (i.e., a Transformer produced by an Estimator), // we can view the parameters it used during fit(). // This prints the parameter (name: value) pairs, where names are unique IDs for this @@ -253,8 +245,8 @@ println("Model 1 was fit using parameters: " + model1.parent.extractParamMap) // We may alternatively specify parameters using a ParamMap, // which supports several methods for specifying parameters. val paramMap = ParamMap(lr.maxIter -> 20) -paramMap.put(lr.maxIter, 30) // Specify 1 Param. This overwrites the original maxIter. -paramMap.put(lr.regParam -> 0.1, lr.threshold -> 0.55) // Specify multiple Params. + .put(lr.maxIter, 30) // Specify 1 Param. This overwrites the original maxIter. + .put(lr.regParam -> 0.1, lr.threshold -> 0.55) // Specify multiple Params. // One can also combine ParamMaps. val paramMap2 = ParamMap(lr.probabilityCol -> "myProbability") // Change output column name @@ -262,27 +254,27 @@ val paramMapCombined = paramMap ++ paramMap2 // Now learn a new model using the paramMapCombined parameters. // paramMapCombined overrides all parameters set earlier via lr.set* methods. -val model2 = lr.fit(training.toDF, paramMapCombined) +val model2 = lr.fit(training, paramMapCombined) println("Model 2 was fit using parameters: " + model2.parent.extractParamMap) // Prepare test data. -val test = sc.parallelize(Seq( - LabeledPoint(1.0, Vectors.dense(-1.0, 1.5, 1.3)), - LabeledPoint(0.0, Vectors.dense(3.0, 2.0, -0.1)), - LabeledPoint(1.0, Vectors.dense(0.0, 2.2, -1.5)))) +val test = sqlContext.createDataFrame(Seq( + (1.0, Vectors.dense(-1.0, 1.5, 1.3)), + (0.0, Vectors.dense(3.0, 2.0, -0.1)), + (1.0, Vectors.dense(0.0, 2.2, -1.5)) +)).toDF("label", "features") // Make predictions on test data using the Transformer.transform() method. // LogisticRegression.transform will only use the 'features' column. 
// Note that model2.transform() outputs a 'myProbability' column instead of the usual // 'probability' column since we renamed the lr.probabilityCol parameter previously. -model2.transform(test.toDF) +model2.transform(test) .select("features", "label", "myProbability", "prediction") .collect() .foreach { case Row(features: Vector, label: Double, prob: Vector, prediction: Double) => println(s"($features, $label) -> prob=$prob, prediction=$prediction") } -sc.stop() {% endhighlight %}
@@ -291,30 +283,23 @@ sc.stop() import java.util.Arrays; import java.util.List; -import org.apache.spark.SparkConf; -import org.apache.spark.api.java.JavaSparkContext; import org.apache.spark.ml.classification.LogisticRegressionModel; import org.apache.spark.ml.param.ParamMap; import org.apache.spark.ml.classification.LogisticRegression; import org.apache.spark.mllib.linalg.Vectors; import org.apache.spark.mllib.regression.LabeledPoint; import org.apache.spark.sql.DataFrame; -import org.apache.spark.sql.SQLContext; import org.apache.spark.sql.Row; -SparkConf conf = new SparkConf().setAppName("JavaSimpleParamsExample"); -JavaSparkContext jsc = new JavaSparkContext(conf); -SQLContext jsql = new SQLContext(jsc); - // Prepare training data. // We use LabeledPoint, which is a JavaBean. Spark SQL can convert RDDs of JavaBeans // into DataFrames, where it uses the bean metadata to infer the schema. -List localTraining = Arrays.asList( +DataFrame training = sqlContext.createDataFrame(Arrays.asList( new LabeledPoint(1.0, Vectors.dense(0.0, 1.1, 0.1)), new LabeledPoint(0.0, Vectors.dense(2.0, 1.0, -1.0)), new LabeledPoint(0.0, Vectors.dense(2.0, 1.3, 1.0)), - new LabeledPoint(1.0, Vectors.dense(0.0, 1.2, -0.5))); -DataFrame training = jsql.createDataFrame(jsc.parallelize(localTraining), LabeledPoint.class); + new LabeledPoint(1.0, Vectors.dense(0.0, 1.2, -0.5)) +), LabeledPoint.class); // Create a LogisticRegression instance. This instance is an Estimator. LogisticRegression lr = new LogisticRegression(); @@ -334,14 +319,14 @@ LogisticRegressionModel model1 = lr.fit(training); System.out.println("Model 1 was fit using parameters: " + model1.parent().extractParamMap()); // We may alternatively specify parameters using a ParamMap. -ParamMap paramMap = new ParamMap(); -paramMap.put(lr.maxIter().w(20)); // Specify 1 Param. -paramMap.put(lr.maxIter(), 30); // This overwrites the original maxIter. -paramMap.put(lr.regParam().w(0.1), lr.threshold().w(0.55)); // Specify multiple Params. +ParamMap paramMap = new ParamMap() + .put(lr.maxIter().w(20)) // Specify 1 Param. + .put(lr.maxIter(), 30) // This overwrites the original maxIter. + .put(lr.regParam().w(0.1), lr.threshold().w(0.55)); // Specify multiple Params. // One can also combine ParamMaps. -ParamMap paramMap2 = new ParamMap(); -paramMap2.put(lr.probabilityCol().w("myProbability")); // Change output column name +ParamMap paramMap2 = new ParamMap() + .put(lr.probabilityCol().w("myProbability")); // Change output column name ParamMap paramMapCombined = paramMap.$plus$plus(paramMap2); // Now learn a new model using the paramMapCombined parameters. @@ -350,11 +335,11 @@ LogisticRegressionModel model2 = lr.fit(training, paramMapCombined); System.out.println("Model 2 was fit using parameters: " + model2.parent().extractParamMap()); // Prepare test documents. -List localTest = Arrays.asList( - new LabeledPoint(1.0, Vectors.dense(-1.0, 1.5, 1.3)), - new LabeledPoint(0.0, Vectors.dense(3.0, 2.0, -0.1)), - new LabeledPoint(1.0, Vectors.dense(0.0, 2.2, -1.5))); -DataFrame test = jsql.createDataFrame(jsc.parallelize(localTest), LabeledPoint.class); +DataFrame test = sqlContext.createDataFrame(Arrays.asList( + new LabeledPoint(1.0, Vectors.dense(-1.0, 1.5, 1.3)), + new LabeledPoint(0.0, Vectors.dense(3.0, 2.0, -0.1)), + new LabeledPoint(1.0, Vectors.dense(0.0, 2.2, -1.5)) +), LabeledPoint.class); // Make predictions on test documents using the Transformer.transform() method. // LogisticRegression.transform will only use the 'features' column. 
@@ -366,28 +351,21 @@ for (Row r: results.select("features", "label", "myProbability", "prediction").c + ", prediction=" + r.get(3)); } -jsc.stop(); {% endhighlight %}
{% highlight python %} -from pyspark import SparkContext -from pyspark.mllib.regression import LabeledPoint +from pyspark.mllib.linalg import Vectors from pyspark.ml.classification import LogisticRegression from pyspark.ml.param import Param, Params -from pyspark.sql import Row, SQLContext -sc = SparkContext(appName="SimpleParamsExample") -sqlContext = SQLContext(sc) - -# Prepare training data. -# We use LabeledPoint. -# Spark SQL can convert RDDs of LabeledPoints into DataFrames. -training = sc.parallelize([LabeledPoint(1.0, [0.0, 1.1, 0.1]), - LabeledPoint(0.0, [2.0, 1.0, -1.0]), - LabeledPoint(0.0, [2.0, 1.3, 1.0]), - LabeledPoint(1.0, [0.0, 1.2, -0.5])]) +# Prepare training data from a list of (label, features) tuples. +training = sqlContext.createDataFrame([ + (1.0, Vectors.dense([0.0, 1.1, 0.1])), + (0.0, Vectors.dense([2.0, 1.0, -1.0])), + (0.0, Vectors.dense([2.0, 1.3, 1.0])), + (1.0, Vectors.dense([0.0, 1.2, -0.5]))], ["label", "features"]) # Create a LogisticRegression instance. This instance is an Estimator. lr = LogisticRegression(maxIter=10, regParam=0.01) @@ -395,7 +373,7 @@ lr = LogisticRegression(maxIter=10, regParam=0.01) print "LogisticRegression parameters:\n" + lr.explainParams() + "\n" # Learn a LogisticRegression model. This uses the parameters stored in lr. -model1 = lr.fit(training.toDF()) +model1 = lr.fit(training) # Since model1 is a Model (i.e., a transformer produced by an Estimator), # we can view the parameters it used during fit(). @@ -416,25 +394,25 @@ paramMapCombined.update(paramMap2) # Now learn a new model using the paramMapCombined parameters. # paramMapCombined overrides all parameters set earlier via lr.set* methods. -model2 = lr.fit(training.toDF(), paramMapCombined) +model2 = lr.fit(training, paramMapCombined) print "Model 2 was fit using parameters: " print model2.extractParamMap() # Prepare test data -test = sc.parallelize([LabeledPoint(1.0, [-1.0, 1.5, 1.3]), - LabeledPoint(0.0, [ 3.0, 2.0, -0.1]), - LabeledPoint(1.0, [ 0.0, 2.2, -1.5])]) +test = sqlContext.createDataFrame([ + (1.0, Vectors.dense([-1.0, 1.5, 1.3])), + (0.0, Vectors.dense([3.0, 2.0, -0.1])), + (1.0, Vectors.dense([0.0, 2.2, -1.5]))], ["label", "features"]) # Make predictions on test data using the Transformer.transform() method. # LogisticRegression.transform will only use the 'features' column. # Note that model2.transform() outputs a "myProbability" column instead of the usual # 'probability' column since we renamed the lr.probabilityCol parameter previously. -prediction = model2.transform(test.toDF()) +prediction = model2.transform(test) selected = prediction.select("features", "label", "myProbability", "prediction") for row in selected.collect(): print row -sc.stop() {% endhighlight %}
@@ -448,30 +426,19 @@ This example follows the simple text document `Pipeline` illustrated in the figu
{% highlight scala %} -import org.apache.spark.{SparkConf, SparkContext} import org.apache.spark.ml.Pipeline import org.apache.spark.ml.classification.LogisticRegression import org.apache.spark.ml.feature.{HashingTF, Tokenizer} import org.apache.spark.mllib.linalg.Vector -import org.apache.spark.sql.{Row, SQLContext} - -// Labeled and unlabeled instance types. -// Spark SQL can infer schema from case classes. -case class LabeledDocument(id: Long, text: String, label: Double) -case class Document(id: Long, text: String) +import org.apache.spark.sql.Row -// Set up contexts. Import implicit conversions to DataFrame from sqlContext. -val conf = new SparkConf().setAppName("SimpleTextClassificationPipeline") -val sc = new SparkContext(conf) -val sqlContext = new SQLContext(sc) -import sqlContext.implicits._ - -// Prepare training documents, which are labeled. -val training = sc.parallelize(Seq( - LabeledDocument(0L, "a b c d e spark", 1.0), - LabeledDocument(1L, "b d", 0.0), - LabeledDocument(2L, "spark f g h", 1.0), - LabeledDocument(3L, "hadoop mapreduce", 0.0))) +// Prepare training documents from a list of (id, text, label) tuples. +val training = sqlContext.createDataFrame(Seq( + (0L, "a b c d e spark", 1.0), + (1L, "b d", 0.0), + (2L, "spark f g h", 1.0), + (3L, "hadoop mapreduce", 0.0) +)).toDF("id", "text", "label") // Configure an ML pipeline, which consists of three stages: tokenizer, hashingTF, and lr. val tokenizer = new Tokenizer() @@ -488,14 +455,15 @@ val pipeline = new Pipeline() .setStages(Array(tokenizer, hashingTF, lr)) // Fit the pipeline to training documents. -val model = pipeline.fit(training.toDF) +val model = pipeline.fit(training) -// Prepare test documents, which are unlabeled. -val test = sc.parallelize(Seq( - Document(4L, "spark i j k"), - Document(5L, "l m n"), - Document(6L, "mapreduce spark"), - Document(7L, "apache hadoop"))) +// Prepare test documents, which are unlabeled (id, text) tuples. +val test = sqlContext.createDataFrame(Seq( + (4L, "spark i j k"), + (5L, "l m n"), + (6L, "mapreduce spark"), + (7L, "apache hadoop") +)).toDF("id", "text") // Make predictions on test documents. model.transform(test.toDF) @@ -505,7 +473,6 @@ model.transform(test.toDF) println(s"($id, $text) --> prob=$prob, prediction=$prediction") } -sc.stop() {% endhighlight %}
@@ -514,8 +481,6 @@ sc.stop() import java.util.Arrays; import java.util.List; -import org.apache.spark.SparkConf; -import org.apache.spark.api.java.JavaSparkContext; import org.apache.spark.ml.Pipeline; import org.apache.spark.ml.PipelineModel; import org.apache.spark.ml.PipelineStage; @@ -524,7 +489,6 @@ import org.apache.spark.ml.feature.HashingTF; import org.apache.spark.ml.feature.Tokenizer; import org.apache.spark.sql.DataFrame; import org.apache.spark.sql.Row; -import org.apache.spark.sql.SQLContext; // Labeled and unlabeled instance types. // Spark SQL can infer schema from Java Beans. @@ -556,18 +520,13 @@ public class LabeledDocument extends Document implements Serializable { public void setLabel(double label) { this.label = label; } } -// Set up contexts. -SparkConf conf = new SparkConf().setAppName("JavaSimpleTextClassificationPipeline"); -JavaSparkContext jsc = new JavaSparkContext(conf); -SQLContext jsql = new SQLContext(jsc); - // Prepare training documents, which are labeled. -List localTraining = Arrays.asList( +DataFrame training = sqlContext.createDataFrame(Arrays.asList( new LabeledDocument(0L, "a b c d e spark", 1.0), new LabeledDocument(1L, "b d", 0.0), new LabeledDocument(2L, "spark f g h", 1.0), - new LabeledDocument(3L, "hadoop mapreduce", 0.0)); -DataFrame training = jsql.createDataFrame(jsc.parallelize(localTraining), LabeledDocument.class); + new LabeledDocument(3L, "hadoop mapreduce", 0.0) +), LabeledDocument.class); // Configure an ML pipeline, which consists of three stages: tokenizer, hashingTF, and lr. Tokenizer tokenizer = new Tokenizer() @@ -587,12 +546,12 @@ Pipeline pipeline = new Pipeline() PipelineModel model = pipeline.fit(training); // Prepare test documents, which are unlabeled. -List localTest = Arrays.asList( +DataFrame test = sqlContext.createDataFrame(Arrays.asList( new Document(4L, "spark i j k"), new Document(5L, "l m n"), new Document(6L, "mapreduce spark"), - new Document(7L, "apache hadoop")); -DataFrame test = jsql.createDataFrame(jsc.parallelize(localTest), Document.class); + new Document(7L, "apache hadoop") +), Document.class); // Make predictions on test documents. DataFrame predictions = model.transform(test); @@ -601,28 +560,23 @@ for (Row r: predictions.select("id", "text", "probability", "prediction").collec + ", prediction=" + r.get(3)); } -jsc.stop(); {% endhighlight %}
{% highlight python %} -from pyspark import SparkContext from pyspark.ml import Pipeline from pyspark.ml.classification import LogisticRegression from pyspark.ml.feature import HashingTF, Tokenizer -from pyspark.sql import Row, SQLContext - -sc = SparkContext(appName="SimpleTextClassificationPipeline") -sqlContext = SQLContext(sc) +from pyspark.sql import Row -# Prepare training documents, which are labeled. +# Prepare training documents from a list of (id, text, label) tuples. LabeledDocument = Row("id", "text", "label") -training = sc.parallelize([(0L, "a b c d e spark", 1.0), - (1L, "b d", 0.0), - (2L, "spark f g h", 1.0), - (3L, "hadoop mapreduce", 0.0)]) \ - .map(lambda x: LabeledDocument(*x)).toDF() +training = sqlContext.createDataFrame([ + (0L, "a b c d e spark", 1.0), + (1L, "b d", 0.0), + (2L, "spark f g h", 1.0), + (3L, "hadoop mapreduce", 0.0)], ["id", "text", "label"]) # Configure an ML pipeline, which consists of tree stages: tokenizer, hashingTF, and lr. tokenizer = Tokenizer(inputCol="text", outputCol="words") @@ -633,13 +587,12 @@ pipeline = Pipeline(stages=[tokenizer, hashingTF, lr]) # Fit the pipeline to training documents. model = pipeline.fit(training) -# Prepare test documents, which are unlabeled. -Document = Row("id", "text") -test = sc.parallelize([(4L, "spark i j k"), - (5L, "l m n"), - (6L, "mapreduce spark"), - (7L, "apache hadoop")]) \ - .map(lambda x: Document(*x)).toDF() +# Prepare test documents, which are unlabeled (id, text) tuples. +test = sqlContext.createDataFrame([ + (4L, "spark i j k"), + (5L, "l m n"), + (6L, "mapreduce spark"), + (7L, "apache hadoop")], ["id", "text"]) # Make predictions on test documents and print columns of interest. prediction = model.transform(test) @@ -647,7 +600,6 @@ selected = prediction.select("id", "text", "prediction") for row in selected.collect(): print(row) -sc.stop() {% endhighlight %}
@@ -664,8 +616,8 @@ Currently, `spark.ml` supports model selection using the [`CrossValidator`](api/ The `Evaluator` can be a [`RegressionEvaluator`](api/scala/index.html#org.apache.spark.ml.RegressionEvaluator) for regression problems, a [`BinaryClassificationEvaluator`](api/scala/index.html#org.apache.spark.ml.BinaryClassificationEvaluator) -for binary data or a [`MultiClassClassificationEvaluator`](api/scala/index.html#org.apache.spark.ml.MultiClassClassificationEvaluator) -for multiclass problems. The default metric used to choose the best `ParamMap` can be overriden by the setMetric +for binary data, or a [`MultiClassClassificationEvaluator`](api/scala/index.html#org.apache.spark.ml.MultiClassClassificationEvaluator) +for multiclass problems. The default metric used to choose the best `ParamMap` can be overriden by the `setMetric` method in each of these evaluators. The `ParamMap` which produces the best evaluation metric (averaged over the `$k$` folds) is selected as the best model. @@ -684,39 +636,29 @@ However, it is also a well-established method for choosing parameters which is m
{% highlight scala %} -import org.apache.spark.{SparkConf, SparkContext} import org.apache.spark.ml.Pipeline import org.apache.spark.ml.classification.LogisticRegression import org.apache.spark.ml.evaluation.BinaryClassificationEvaluator import org.apache.spark.ml.feature.{HashingTF, Tokenizer} import org.apache.spark.ml.tuning.{ParamGridBuilder, CrossValidator} import org.apache.spark.mllib.linalg.Vector -import org.apache.spark.sql.{Row, SQLContext} - -// Labeled and unlabeled instance types. -// Spark SQL can infer schema from case classes. -case class LabeledDocument(id: Long, text: String, label: Double) -case class Document(id: Long, text: String) - -val conf = new SparkConf().setAppName("CrossValidatorExample") -val sc = new SparkContext(conf) -val sqlContext = new SQLContext(sc) -import sqlContext.implicits._ - -// Prepare training documents, which are labeled. -val training = sc.parallelize(Seq( - LabeledDocument(0L, "a b c d e spark", 1.0), - LabeledDocument(1L, "b d", 0.0), - LabeledDocument(2L, "spark f g h", 1.0), - LabeledDocument(3L, "hadoop mapreduce", 0.0), - LabeledDocument(4L, "b spark who", 1.0), - LabeledDocument(5L, "g d a y", 0.0), - LabeledDocument(6L, "spark fly", 1.0), - LabeledDocument(7L, "was mapreduce", 0.0), - LabeledDocument(8L, "e spark program", 1.0), - LabeledDocument(9L, "a e c l", 0.0), - LabeledDocument(10L, "spark compile", 1.0), - LabeledDocument(11L, "hadoop software", 0.0))) +import org.apache.spark.sql.Row + +// Prepare training data from a list of (id, text, label) tuples. +val training = sqlContext.createDataFrame(Seq( + (0L, "a b c d e spark", 1.0), + (1L, "b d", 0.0), + (2L, "spark f g h", 1.0), + (3L, "hadoop mapreduce", 0.0), + (4L, "b spark who", 1.0), + (5L, "g d a y", 0.0), + (6L, "spark fly", 1.0), + (7L, "was mapreduce", 0.0), + (8L, "e spark program", 1.0), + (9L, "a e c l", 0.0), + (10L, "spark compile", 1.0), + (11L, "hadoop software", 0.0) +)).toDF("id", "text", "label") // Configure an ML pipeline, which consists of three stages: tokenizer, hashingTF, and lr. val tokenizer = new Tokenizer() @@ -730,15 +672,6 @@ val lr = new LogisticRegression() val pipeline = new Pipeline() .setStages(Array(tokenizer, hashingTF, lr)) -// We now treat the Pipeline as an Estimator, wrapping it in a CrossValidator instance. -// This will allow us to jointly choose parameters for all Pipeline stages. -// A CrossValidator requires an Estimator, a set of Estimator ParamMaps, and an Evaluator. -// Note that the evaluator here is a BinaryClassificationEvaluator and the default metric -// used is areaUnderROC. -val crossval = new CrossValidator() - .setEstimator(pipeline) - .setEvaluator(new BinaryClassificationEvaluator) - // We use a ParamGridBuilder to construct a grid of parameters to search over. // With 3 values for hashingTF.numFeatures and 2 values for lr.regParam, // this grid will have 3 x 2 = 6 parameter settings for CrossValidator to choose from. @@ -746,28 +679,37 @@ val paramGrid = new ParamGridBuilder() .addGrid(hashingTF.numFeatures, Array(10, 100, 1000)) .addGrid(lr.regParam, Array(0.1, 0.01)) .build() -crossval.setEstimatorParamMaps(paramGrid) -crossval.setNumFolds(2) // Use 3+ in practice + +// We now treat the Pipeline as an Estimator, wrapping it in a CrossValidator instance. +// This will allow us to jointly choose parameters for all Pipeline stages. +// A CrossValidator requires an Estimator, a set of Estimator ParamMaps, and an Evaluator. 
+// Note that the evaluator here is a BinaryClassificationEvaluator and its default metric +// is areaUnderROC. +val cv = new CrossValidator() + .setEstimator(pipeline) + .setEvaluator(new BinaryClassificationEvaluator) + .setEstimatorParamMaps(paramGrid) + .setNumFolds(2) // Use 3+ in practice // Run cross-validation, and choose the best set of parameters. -val cvModel = crossval.fit(training.toDF) +val cvModel = cv.fit(training) -// Prepare test documents, which are unlabeled. -val test = sc.parallelize(Seq( - Document(4L, "spark i j k"), - Document(5L, "l m n"), - Document(6L, "mapreduce spark"), - Document(7L, "apache hadoop"))) +// Prepare test documents, which are unlabeled (id, text) tuples. +val test = sqlContext.createDataFrame(Seq( + (4L, "spark i j k"), + (5L, "l m n"), + (6L, "mapreduce spark"), + (7L, "apache hadoop") +)).toDF("id", "text") // Make predictions on test documents. cvModel uses the best model found (lrModel). -cvModel.transform(test.toDF) +cvModel.transform(test) .select("id", "text", "probability", "prediction") .collect() .foreach { case Row(id: Long, text: String, prob: Vector, prediction: Double) => - println(s"($id, $text) --> prob=$prob, prediction=$prediction") -} + println(s"($id, $text) --> prob=$prob, prediction=$prediction") + } -sc.stop() {% endhighlight %}
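A minimal follow-up sketch, assuming the `cvModel` fit above: the fitted `CrossValidatorModel` exposes the winning fitted pipeline through `bestModel`, so the tuned stages can be inspected after `fit()`:

{% highlight scala %}
import org.apache.spark.ml.PipelineModel
import org.apache.spark.ml.classification.LogisticRegressionModel

// bestModel is the PipelineModel trained with the best-performing ParamMap from paramGrid.
val bestPipeline = cvModel.bestModel.asInstanceOf[PipelineModel]
val bestLr = bestPipeline.stages.last.asInstanceOf[LogisticRegressionModel]
println("Best LogisticRegression parameters:\n" + bestLr.extractParamMap())
{% endhighlight %}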
@@ -776,8 +718,6 @@ sc.stop() import java.util.Arrays; import java.util.List; -import org.apache.spark.SparkConf; -import org.apache.spark.api.java.JavaSparkContext; import org.apache.spark.ml.Pipeline; import org.apache.spark.ml.PipelineStage; import org.apache.spark.ml.classification.LogisticRegression; @@ -790,7 +730,6 @@ import org.apache.spark.ml.tuning.CrossValidatorModel; import org.apache.spark.ml.tuning.ParamGridBuilder; import org.apache.spark.sql.DataFrame; import org.apache.spark.sql.Row; -import org.apache.spark.sql.SQLContext; // Labeled and unlabeled instance types. // Spark SQL can infer schema from Java Beans. @@ -822,12 +761,9 @@ public class LabeledDocument extends Document implements Serializable { public void setLabel(double label) { this.label = label; } } -SparkConf conf = new SparkConf().setAppName("JavaCrossValidatorExample"); -JavaSparkContext jsc = new JavaSparkContext(conf); -SQLContext jsql = new SQLContext(jsc); // Prepare training documents, which are labeled. -List localTraining = Arrays.asList( +DataFrame training = sqlContext.createDataFrame(Arrays.asList( new LabeledDocument(0L, "a b c d e spark", 1.0), new LabeledDocument(1L, "b d", 0.0), new LabeledDocument(2L, "spark f g h", 1.0), @@ -839,8 +775,8 @@ List localTraining = Arrays.asList( new LabeledDocument(8L, "e spark program", 1.0), new LabeledDocument(9L, "a e c l", 0.0), new LabeledDocument(10L, "spark compile", 1.0), - new LabeledDocument(11L, "hadoop software", 0.0)); -DataFrame training = jsql.createDataFrame(jsc.parallelize(localTraining), LabeledDocument.class); + new LabeledDocument(11L, "hadoop software", 0.0) +), LabeledDocument.class); // Configure an ML pipeline, which consists of three stages: tokenizer, hashingTF, and lr. Tokenizer tokenizer = new Tokenizer() @@ -856,15 +792,6 @@ LogisticRegression lr = new LogisticRegression() Pipeline pipeline = new Pipeline() .setStages(new PipelineStage[] {tokenizer, hashingTF, lr}); -// We now treat the Pipeline as an Estimator, wrapping it in a CrossValidator instance. -// This will allow us to jointly choose parameters for all Pipeline stages. -// A CrossValidator requires an Estimator, a set of Estimator ParamMaps, and an Evaluator. -// Note that the evaluator here is a BinaryClassificationEvaluator and the default metric -// used is areaUnderROC. -CrossValidator crossval = new CrossValidator() - .setEstimator(pipeline) - .setEvaluator(new BinaryClassificationEvaluator()); - // We use a ParamGridBuilder to construct a grid of parameters to search over. // With 3 values for hashingTF.numFeatures and 2 values for lr.regParam, // this grid will have 3 x 2 = 6 parameter settings for CrossValidator to choose from. @@ -872,19 +799,28 @@ ParamMap[] paramGrid = new ParamGridBuilder() .addGrid(hashingTF.numFeatures(), new int[]{10, 100, 1000}) .addGrid(lr.regParam(), new double[]{0.1, 0.01}) .build(); -crossval.setEstimatorParamMaps(paramGrid); -crossval.setNumFolds(2); // Use 3+ in practice + +// We now treat the Pipeline as an Estimator, wrapping it in a CrossValidator instance. +// This will allow us to jointly choose parameters for all Pipeline stages. +// A CrossValidator requires an Estimator, a set of Estimator ParamMaps, and an Evaluator. +// Note that the evaluator here is a BinaryClassificationEvaluator and its default metric +// is areaUnderROC. 
+CrossValidator cv = new CrossValidator() + .setEstimator(pipeline) + .setEvaluator(new BinaryClassificationEvaluator()) + .setEstimatorParamMaps(paramGrid) + .setNumFolds(2); // Use 3+ in practice // Run cross-validation, and choose the best set of parameters. -CrossValidatorModel cvModel = crossval.fit(training); +CrossValidatorModel cvModel = cv.fit(training); // Prepare test documents, which are unlabeled. -List localTest = Arrays.asList( +DataFrame test = sqlContext.createDataFrame(Arrays.asList( new Document(4L, "spark i j k"), new Document(5L, "l m n"), new Document(6L, "mapreduce spark"), - new Document(7L, "apache hadoop")); -DataFrame test = jsql.createDataFrame(jsc.parallelize(localTest), Document.class); + new Document(7L, "apache hadoop") +), Document.class); // Make predictions on test documents. cvModel uses the best model found (lrModel). DataFrame predictions = cvModel.transform(test); @@ -893,7 +829,6 @@ for (Row r: predictions.select("id", "text", "probability", "prediction").collec + ", prediction=" + r.get(3)); } -jsc.stop(); {% endhighlight %} @@ -935,7 +870,7 @@ val lr = new LinearRegression() // the evaluator. val paramGrid = new ParamGridBuilder() .addGrid(lr.regParam, Array(0.1, 0.01)) - .addGrid(lr.fitIntercept, Array(true, false)) + .addGrid(lr.fitIntercept) .addGrid(lr.elasticNetParam, Array(0.0, 0.5, 1.0)) .build() @@ -945,9 +880,8 @@ val trainValidationSplit = new TrainValidationSplit() .setEstimator(lr) .setEvaluator(new RegressionEvaluator) .setEstimatorParamMaps(paramGrid) - -// 80% of the data will be used for training and the remaining 20% for validation. -trainValidationSplit.setTrainRatio(0.8) + // 80% of the data will be used for training and the remaining 20% for validation. + .setTrainRatio(0.8) // Run train validation split, and choose the best set of parameters. val model = trainValidationSplit.fit(training) @@ -972,12 +906,12 @@ import org.apache.spark.mllib.util.MLUtils; import org.apache.spark.rdd.RDD; import org.apache.spark.sql.DataFrame; -DataFrame data = jsql.createDataFrame( +DataFrame data = sqlContext.createDataFrame( MLUtils.loadLibSVMFile(jsc.sc(), "data/mllib/sample_libsvm_data.txt"), LabeledPoint.class); // Prepare training and test data. -DataFrame[] splits = data.randomSplit(new double [] {0.9, 0.1}, 12345); +DataFrame[] splits = data.randomSplit(new double[] {0.9, 0.1}, 12345); DataFrame training = splits[0]; DataFrame test = splits[1]; @@ -997,10 +931,8 @@ ParamMap[] paramGrid = new ParamGridBuilder() TrainValidationSplit trainValidationSplit = new TrainValidationSplit() .setEstimator(lr) .setEvaluator(new RegressionEvaluator()) - .setEstimatorParamMaps(paramGrid); - -// 80% of the data will be used for training and the remaining 20% for validation. -trainValidationSplit.setTrainRatio(0.8); + .setEstimatorParamMaps(paramGrid) + .setTrainRatio(0.8); // 80% for training and the remaining 20% for validation // Run train validation split, and choose the best set of parameters. TrainValidationSplitModel model = trainValidationSplit.fit(training); From 1bfd9347822df65e76201c4c471a26488d722319 Mon Sep 17 00:00:00 2001 From: ihainan Date: Sun, 30 Aug 2015 08:26:14 +0100 Subject: [PATCH 146/802] [SPARK-10184] [CORE] Optimization for bounds determination in RangePartitioner JIRA Issue: https://issues.apache.org/jira/browse/SPARK-10184 Change `cumWeight > target` to `cumWeight >= target` in `RangePartitioner.determineBounds` method to make the output partitions more balanced. 
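A simplified sketch of the bound-selection loop (not the actual private `determineBounds`, which also skips duplicate keys) shows why the inclusive comparison yields more balanced partitions:

{% highlight scala %}
// Four equally weighted candidate keys, 2 partitions => target = totalWeight / 2 = 2.0.
val ordered = Seq((10, 1.0), (20, 1.0), (30, 1.0), (40, 1.0))
val target = ordered.map(_._2).sum / 2

var cumWeight = 0.0
val bounds = scala.collection.mutable.ArrayBuffer[Int]()
for ((key, weight) <- ordered if bounds.isEmpty) {  // only 1 bound is needed for 2 partitions
  cumWeight += weight
  // With ">",  the bound is not taken until key 30, giving a lopsided {10, 20, 30} / {40} split.
  // With ">=", the bound is taken at key 20, giving a balanced {10, 20} / {30, 40} split.
  if (cumWeight >= target) {
    bounds += key
  }
}
{% endhighlight %}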
Author: ihainan Closes #8397 from ihainan/opt_for_rangepartitioner. --- core/src/main/scala/org/apache/spark/Partitioner.scala | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/core/src/main/scala/org/apache/spark/Partitioner.scala b/core/src/main/scala/org/apache/spark/Partitioner.scala index 4b9d59975bdc2..29e581bb57cbc 100644 --- a/core/src/main/scala/org/apache/spark/Partitioner.scala +++ b/core/src/main/scala/org/apache/spark/Partitioner.scala @@ -291,7 +291,7 @@ private[spark] object RangePartitioner { while ((i < numCandidates) && (j < partitions - 1)) { val (key, weight) = ordered(i) cumWeight += weight - if (cumWeight > target) { + if (cumWeight >= target) { // Skip duplicate values. if (previousBound.isEmpty || ordering.gt(key, previousBound.get)) { bounds += key From 8d2ab75d3b71b632f2394f2453af32f417cb45e5 Mon Sep 17 00:00:00 2001 From: Burak Yavuz Date: Sun, 30 Aug 2015 12:21:15 -0700 Subject: [PATCH 147/802] [SPARK-10353] [MLLIB] BLAS gemm not scaling when beta = 0.0 for some subset of matrix multiplications mengxr jkbradley rxin It would be great if this fix made it into RC3! Author: Burak Yavuz Closes #8525 from brkyvz/blas-scaling. --- .../org/apache/spark/mllib/linalg/BLAS.scala | 26 +++++++------------ .../apache/spark/mllib/linalg/BLASSuite.scala | 5 ++++ 2 files changed, 15 insertions(+), 16 deletions(-) diff --git a/mllib/src/main/scala/org/apache/spark/mllib/linalg/BLAS.scala b/mllib/src/main/scala/org/apache/spark/mllib/linalg/BLAS.scala index bbbcc8436b7c2..ab475af264dd3 100644 --- a/mllib/src/main/scala/org/apache/spark/mllib/linalg/BLAS.scala +++ b/mllib/src/main/scala/org/apache/spark/mllib/linalg/BLAS.scala @@ -305,6 +305,8 @@ private[spark] object BLAS extends Serializable with Logging { "The matrix C cannot be the product of a transpose() call. C.isTransposed must be false.") if (alpha == 0.0 && beta == 1.0) { logDebug("gemm: alpha is equal to 0 and beta is equal to 1. Returning C.") + } else if (alpha == 0.0) { + f2jBLAS.dscal(C.values.length, beta, C.values, 1) } else { A match { case sparse: SparseMatrix => gemm(alpha, sparse, B, beta, C) @@ -408,8 +410,8 @@ private[spark] object BLAS extends Serializable with Logging { } } } else { - // Scale matrix first if `beta` is not equal to 0.0 - if (beta != 0.0) { + // Scale matrix first if `beta` is not equal to 1.0 + if (beta != 1.0) { f2jBLAS.dscal(C.values.length, beta, C.values, 1) } // Perform matrix multiplication and add to C. The rows of A are multiplied by the columns of @@ -470,8 +472,10 @@ private[spark] object BLAS extends Serializable with Logging { s"The columns of A don't match the number of elements of x. A: ${A.numCols}, x: ${x.size}") require(A.numRows == y.size, s"The rows of A don't match the number of elements of y. A: ${A.numRows}, y:${y.size}") - if (alpha == 0.0) { - logDebug("gemv: alpha is equal to 0. Returning y.") + if (alpha == 0.0 && beta == 1.0) { + logDebug("gemv: alpha is equal to 0 and beta is equal to 1. 
Returning y.") + } else if (alpha == 0.0) { + scal(beta, y) } else { (A, x) match { case (smA: SparseMatrix, dvx: DenseVector) => @@ -526,11 +530,6 @@ private[spark] object BLAS extends Serializable with Logging { val xValues = x.values val yValues = y.values - if (alpha == 0.0) { - scal(beta, y) - return - } - if (A.isTransposed) { var rowCounterForA = 0 while (rowCounterForA < mA) { @@ -581,11 +580,6 @@ private[spark] object BLAS extends Serializable with Logging { val Arows = if (!A.isTransposed) A.rowIndices else A.colPtrs val Acols = if (!A.isTransposed) A.colPtrs else A.rowIndices - if (alpha == 0.0) { - scal(beta, y) - return - } - if (A.isTransposed) { var rowCounter = 0 while (rowCounter < mA) { @@ -604,7 +598,7 @@ private[spark] object BLAS extends Serializable with Logging { rowCounter += 1 } } else { - scal(beta, y) + if (beta != 1.0) scal(beta, y) var colCounterForA = 0 var k = 0 @@ -659,7 +653,7 @@ private[spark] object BLAS extends Serializable with Logging { rowCounter += 1 } } else { - scal(beta, y) + if (beta != 1.0) scal(beta, y) // Perform matrix-vector multiplication and add to y var colCounterForA = 0 while (colCounterForA < nA) { diff --git a/mllib/src/test/scala/org/apache/spark/mllib/linalg/BLASSuite.scala b/mllib/src/test/scala/org/apache/spark/mllib/linalg/BLASSuite.scala index d119e0b50a393..8db5c8424abe9 100644 --- a/mllib/src/test/scala/org/apache/spark/mllib/linalg/BLASSuite.scala +++ b/mllib/src/test/scala/org/apache/spark/mllib/linalg/BLASSuite.scala @@ -204,6 +204,7 @@ class BLASSuite extends SparkFunSuite { val C14 = C1.copy val C15 = C1.copy val C16 = C1.copy + val C17 = C1.copy val expected2 = new DenseMatrix(4, 2, Array(2.0, 1.0, 4.0, 2.0, 4.0, 0.0, 4.0, 3.0)) val expected3 = new DenseMatrix(4, 2, Array(2.0, 2.0, 4.0, 2.0, 8.0, 0.0, 6.0, 6.0)) val expected4 = new DenseMatrix(4, 2, Array(5.0, 0.0, 10.0, 5.0, 0.0, 0.0, 5.0, 0.0)) @@ -217,6 +218,10 @@ class BLASSuite extends SparkFunSuite { assert(C2 ~== expected2 absTol 1e-15) assert(C3 ~== expected3 absTol 1e-15) assert(C4 ~== expected3 absTol 1e-15) + gemm(1.0, dA, B, 0.0, C17) + assert(C17 ~== expected absTol 1e-15) + gemm(1.0, sA, B, 0.0, C17) + assert(C17 ~== expected absTol 1e-15) withClue("columns of A don't match the rows of B") { intercept[Exception] { From 35e896a79bb5e72d63b82b047f46f4f6fa2e1970 Mon Sep 17 00:00:00 2001 From: Patrick Wendell Date: Sun, 30 Aug 2015 21:39:16 -0700 Subject: [PATCH 148/802] SPARK-9545, SPARK-9547: Use Maven in PRB if title contains "[test-maven]" This is just some small glue code to actually make use of the AMPLAB_JENKINS_BUILD_TOOL switch. As far as I can tell, we actually don't currently use the Maven support in the tool even though it exists. This patch switches to Maven when the PR title contains "test-maven". There are a few small other pieces of cleanup in the patch as well. Author: Patrick Wendell Closes #7878 from pwendell/maven-tests. 
--- dev/run-tests-jenkins | 18 ++++++++++++++++-- dev/run-tests.py | 28 ++++++++++++++++++++++++++-- 2 files changed, 42 insertions(+), 4 deletions(-) diff --git a/dev/run-tests-jenkins b/dev/run-tests-jenkins index 39cf54f78104c..3be78575e70f1 100755 --- a/dev/run-tests-jenkins +++ b/dev/run-tests-jenkins @@ -164,8 +164,9 @@ pr_message="" current_pr_head="`git rev-parse HEAD`" echo "HEAD: `git rev-parse HEAD`" -echo "GHPRB: $ghprbActualCommit" -echo "SHA1: $sha1" +echo "\$ghprbActualCommit: $ghprbActualCommit" +echo "\$sha1: $sha1" +echo "\$ghprbPullTitle: $ghprbPullTitle" # Run pull request tests for t in "${PR_TESTS[@]}"; do @@ -189,6 +190,19 @@ done { # Marks this build is a pull request build. export AMP_JENKINS_PRB=true + if [[ $ghprbPullTitle == *"test-maven"* ]]; then + export AMPLAB_JENKINS_BUILD_TOOL="maven" + fi + if [[ $ghprbPullTitle == *"test-hadoop1.0"* ]]; then + export AMPLAB_JENKINS_BUILD_PROFILE="hadoop1.0" + elif [[ $ghprbPullTitle == *"test-hadoop2.0"* ]]; then + export AMPLAB_JENKINS_BUILD_PROFILE="hadoop2.0" + elif [[ $ghprbPullTitle == *"test-hadoop2.2"* ]]; then + export AMPLAB_JENKINS_BUILD_PROFILE="hadoop2.2" + elif [[ $ghprbPullTitle == *"test-hadoop2.3"* ]]; then + export AMPLAB_JENKINS_BUILD_PROFILE="hadoop2.3" + fi + timeout "${TESTS_TIMEOUT}" ./dev/run-tests test_result="$?" diff --git a/dev/run-tests.py b/dev/run-tests.py index 4fd703a7c219f..d8b22e1665e7b 100755 --- a/dev/run-tests.py +++ b/dev/run-tests.py @@ -21,6 +21,7 @@ import itertools from optparse import OptionParser import os +import random import re import sys import subprocess @@ -239,11 +240,32 @@ def build_spark_documentation(): os.chdir(SPARK_HOME) +def get_zinc_port(): + """ + Get a randomized port on which to start Zinc + """ + return random.randrange(3030, 4030) + + +def kill_zinc_on_port(zinc_port): + """ + Kill the Zinc process running on the given port, if one exists. + """ + cmd = ("/usr/sbin/lsof -P |grep %s | grep LISTEN " + "| awk '{ print $2; }' | xargs kill") % zinc_port + subprocess.check_call(cmd, shell=True) + + def exec_maven(mvn_args=()): """Will call Maven in the current directory with the list of mvn_args passed in and returns the subprocess for any further processing""" - run_cmd([os.path.join(SPARK_HOME, "build", "mvn")] + mvn_args) + zinc_port = get_zinc_port() + os.environ["ZINC_PORT"] = "%s" % zinc_port + zinc_flag = "-DzincPort=%s" % zinc_port + flags = [os.path.join(SPARK_HOME, "build", "mvn"), "--force", zinc_flag] + run_cmd(flags + mvn_args) + kill_zinc_on_port(zinc_port) def exec_sbt(sbt_args=()): @@ -514,7 +536,9 @@ def main(): build_apache_spark(build_tool, hadoop_version) # backwards compatibility checks - detect_binary_inop_with_mima() + if build_tool == "sbt": + # Note: compatiblity tests only supported in sbt for now + detect_binary_inop_with_mima() # run the test suites run_scala_tests(build_tool, hadoop_version, test_modules) From 8694c3ad7dcafca9563649e93b7a08076748d6f2 Mon Sep 17 00:00:00 2001 From: Feynman Liang Date: Sun, 30 Aug 2015 23:12:56 -0700 Subject: [PATCH 149/802] [SPARK-10351] [SQL] Fixes UTF8String.fromAddress to handle off-heap memory CC rxin marmbrus Author: Feynman Liang Closes #8523 from feynmanliang/SPARK-10351. 
--- .../test/scala/org/apache/spark/sql/UnsafeRowSuite.scala | 9 +++++---- .../java/org/apache/spark/unsafe/types/UTF8String.java | 6 +----- 2 files changed, 6 insertions(+), 9 deletions(-) diff --git a/sql/core/src/test/scala/org/apache/spark/sql/UnsafeRowSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/UnsafeRowSuite.scala index 219435dff5bc8..2476b10e3cf9e 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/UnsafeRowSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/UnsafeRowSuite.scala @@ -43,12 +43,12 @@ class UnsafeRowSuite extends SparkFunSuite { val arrayBackedUnsafeRow: UnsafeRow = UnsafeProjection.create(Array[DataType](StringType, StringType, IntegerType)).apply(row) assert(arrayBackedUnsafeRow.getBaseObject.isInstanceOf[Array[Byte]]) - val bytesFromArrayBackedRow: Array[Byte] = { + val (bytesFromArrayBackedRow, field0StringFromArrayBackedRow): (Array[Byte], String) = { val baos = new ByteArrayOutputStream() arrayBackedUnsafeRow.writeToStream(baos, null) - baos.toByteArray + (baos.toByteArray, arrayBackedUnsafeRow.getString(0)) } - val bytesFromOffheapRow: Array[Byte] = { + val (bytesFromOffheapRow, field0StringFromOffheapRow): (Array[Byte], String) = { val offheapRowPage = MemoryAllocator.UNSAFE.allocate(arrayBackedUnsafeRow.getSizeInBytes) try { Platform.copyMemory( @@ -69,13 +69,14 @@ class UnsafeRowSuite extends SparkFunSuite { val baos = new ByteArrayOutputStream() val writeBuffer = new Array[Byte](1024) offheapUnsafeRow.writeToStream(baos, writeBuffer) - baos.toByteArray + (baos.toByteArray, offheapUnsafeRow.getString(0)) } finally { MemoryAllocator.UNSAFE.free(offheapRowPage) } } assert(bytesFromArrayBackedRow === bytesFromOffheapRow) + assert(field0StringFromArrayBackedRow === field0StringFromOffheapRow) } test("calling getDouble() and getFloat() on null columns") { diff --git a/unsafe/src/main/java/org/apache/spark/unsafe/types/UTF8String.java b/unsafe/src/main/java/org/apache/spark/unsafe/types/UTF8String.java index cbcab958c05a9..216aeea60d1c8 100644 --- a/unsafe/src/main/java/org/apache/spark/unsafe/types/UTF8String.java +++ b/unsafe/src/main/java/org/apache/spark/unsafe/types/UTF8String.java @@ -90,11 +90,7 @@ public static UTF8String fromBytes(byte[] bytes, int offset, int numBytes) { * Creates an UTF8String from given address (base and offset) and length. */ public static UTF8String fromAddress(Object base, long offset, int numBytes) { - if (base != null) { - return new UTF8String(base, offset, numBytes); - } else { - return null; - } + return new UTF8String(base, offset, numBytes); } /** From f0f563a3c43fc9683e6920890cce44611c0c5f4b Mon Sep 17 00:00:00 2001 From: Xiangrui Meng Date: Sun, 30 Aug 2015 23:20:03 -0700 Subject: [PATCH 150/802] [SPARK-100354] [MLLIB] fix some apparent memory issues in k-means|| initializaiton * do not cache first cost RDD * change following cost RDD cache level to MEMORY_AND_DISK * remove Vector wrapper to save a object per instance Further improvements will be addressed in SPARK-10329 cc: yu-iskw HuJiayin Author: Xiangrui Meng Closes #8526 from mengxr/SPARK-10354. 
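A simplified sketch of the caching discipline behind these changes (`updateCosts` and `pointCost` are hypothetical names; `pointCost` stands in for the distance computation against the broadcast centers shown in the diff):

{% highlight scala %}
import org.apache.spark.mllib.linalg.Vector
import org.apache.spark.rdd.RDD
import org.apache.spark.storage.StorageLevel

def updateCosts(
    data: RDD[Vector],
    runs: Int,
    steps: Int)(pointCost: (Int, Vector) => Double): RDD[Array[Double]] = {
  // First cost RDD: cheap to recompute, so it is no longer cached.
  var costs = data.map(_ => Array.fill(runs)(Double.PositiveInfinity))
  var step = 0
  while (step < steps) {
    val preCosts = costs
    costs = data.zip(preCosts).map { case (point, cost) =>
      // Plain Array[Double] instead of a Vector wrapper: one fewer object per point.
      Array.tabulate(runs)(r => math.min(pointCost(r, point), cost(r)))
    }.persist(StorageLevel.MEMORY_AND_DISK)  // spill to disk rather than drop and recompute
    costs.count()  // materialize (the real code does this via an aggregate over all runs)
    preCosts.unpersist(blocking = false)  // release the previous step's cached costs
    step += 1
  }
  costs
}
{% endhighlight %}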
--- .../spark/mllib/clustering/KMeans.scala | 21 ++++++++++++------- 1 file changed, 14 insertions(+), 7 deletions(-) diff --git a/mllib/src/main/scala/org/apache/spark/mllib/clustering/KMeans.scala b/mllib/src/main/scala/org/apache/spark/mllib/clustering/KMeans.scala index 46920fffe6e1a..7168aac32c997 100644 --- a/mllib/src/main/scala/org/apache/spark/mllib/clustering/KMeans.scala +++ b/mllib/src/main/scala/org/apache/spark/mllib/clustering/KMeans.scala @@ -369,7 +369,7 @@ class KMeans private ( : Array[Array[VectorWithNorm]] = { // Initialize empty centers and point costs. val centers = Array.tabulate(runs)(r => ArrayBuffer.empty[VectorWithNorm]) - var costs = data.map(_ => Vectors.dense(Array.fill(runs)(Double.PositiveInfinity))).cache() + var costs = data.map(_ => Array.fill(runs)(Double.PositiveInfinity)) // Initialize each run's first center to a random point. val seed = new XORShiftRandom(this.seed).nextInt() @@ -394,21 +394,28 @@ class KMeans private ( val bcNewCenters = data.context.broadcast(newCenters) val preCosts = costs costs = data.zip(preCosts).map { case (point, cost) => - Vectors.dense( Array.tabulate(runs) { r => math.min(KMeans.pointCost(bcNewCenters.value(r), point), cost(r)) - }) - }.cache() + } + }.persist(StorageLevel.MEMORY_AND_DISK) val sumCosts = costs - .aggregate(Vectors.zeros(runs))( + .aggregate(new Array[Double](runs))( seqOp = (s, v) => { // s += v - axpy(1.0, v, s) + var r = 0 + while (r < runs) { + s(r) += v(r) + r += 1 + } s }, combOp = (s0, s1) => { // s0 += s1 - axpy(1.0, s1, s0) + var r = 0 + while (r < runs) { + s0(r) += s1(r) + r += 1 + } s0 } ) From 72f6dbf7b0c8b271f5f9c762374422c69c8ab43d Mon Sep 17 00:00:00 2001 From: EugenCepoi Date: Mon, 31 Aug 2015 13:24:35 -0500 Subject: [PATCH 151/802] [SPARK-8730] Fixes - Deser objects containing a primitive class attribute Author: EugenCepoi Closes #7122 from EugenCepoi/master. 
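A minimal sketch of the failure mode being fixed (`HoldsPrimitiveClass` is a hypothetical class mirroring the test added in this patch): a serialized field holding `classOf[Int]` is written out under the class name `"int"`, which `Class.forName` cannot resolve, so deserialization used to fail with `ClassNotFoundException`:

{% highlight scala %}
import org.apache.spark.SparkConf
import org.apache.spark.serializer.JavaSerializer

// Any serializable object carrying a primitive class token triggers the problem.
class HoldsPrimitiveClass extends Serializable {
  val intClass: Class[_] = classOf[Int]  // recorded under the name "int"
}

val instance = new JavaSerializer(new SparkConf()).newInstance()
val bytes = instance.serialize(new HoldsPrimitiveClass())
// Before this patch: ClassNotFoundException("int"); after: the primitive name is mapped back.
val restored = instance.deserialize[HoldsPrimitiveClass](bytes)
assert(restored.intClass == classOf[Int])
{% endhighlight %}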
--- .../spark/serializer/JavaSerializer.scala | 27 +++++++++++++++---- .../serializer/JavaSerializerSuite.scala | 18 +++++++++++++ 2 files changed, 40 insertions(+), 5 deletions(-) diff --git a/core/src/main/scala/org/apache/spark/serializer/JavaSerializer.scala b/core/src/main/scala/org/apache/spark/serializer/JavaSerializer.scala index 4a5274b46b7a0..b463a71d5bd7d 100644 --- a/core/src/main/scala/org/apache/spark/serializer/JavaSerializer.scala +++ b/core/src/main/scala/org/apache/spark/serializer/JavaSerializer.scala @@ -62,17 +62,34 @@ private[spark] class JavaDeserializationStream(in: InputStream, loader: ClassLoa extends DeserializationStream { private val objIn = new ObjectInputStream(in) { - override def resolveClass(desc: ObjectStreamClass): Class[_] = { - // scalastyle:off classforname - Class.forName(desc.getName, false, loader) - // scalastyle:on classforname - } + override def resolveClass(desc: ObjectStreamClass): Class[_] = + try { + // scalastyle:off classforname + Class.forName(desc.getName, false, loader) + // scalastyle:on classforname + } catch { + case e: ClassNotFoundException => + JavaDeserializationStream.primitiveMappings.get(desc.getName).getOrElse(throw e) + } } def readObject[T: ClassTag](): T = objIn.readObject().asInstanceOf[T] def close() { objIn.close() } } +private object JavaDeserializationStream { + val primitiveMappings = Map[String, Class[_]]( + "boolean" -> classOf[Boolean], + "byte" -> classOf[Byte], + "char" -> classOf[Char], + "short" -> classOf[Short], + "int" -> classOf[Int], + "long" -> classOf[Long], + "float" -> classOf[Float], + "double" -> classOf[Double], + "void" -> classOf[Void] + ) +} private[spark] class JavaSerializerInstance( counterReset: Int, extraDebugInfo: Boolean, defaultClassLoader: ClassLoader) diff --git a/core/src/test/scala/org/apache/spark/serializer/JavaSerializerSuite.scala b/core/src/test/scala/org/apache/spark/serializer/JavaSerializerSuite.scala index 329a2b6dad831..20f45670bc2ba 100644 --- a/core/src/test/scala/org/apache/spark/serializer/JavaSerializerSuite.scala +++ b/core/src/test/scala/org/apache/spark/serializer/JavaSerializerSuite.scala @@ -25,4 +25,22 @@ class JavaSerializerSuite extends SparkFunSuite { val instance = serializer.newInstance() instance.deserialize[JavaSerializer](instance.serialize(serializer)) } + + test("Deserialize object containing a primitive Class as attribute") { + val serializer = new JavaSerializer(new SparkConf()) + val instance = serializer.newInstance() + instance.deserialize[JavaSerializer](instance.serialize(new ContainsPrimitiveClass())) + } +} + +private class ContainsPrimitiveClass extends Serializable { + val intClass = classOf[Int] + val longClass = classOf[Long] + val shortClass = classOf[Short] + val charClass = classOf[Char] + val doubleClass = classOf[Double] + val floatClass = classOf[Float] + val booleanClass = classOf[Boolean] + val byteClass = classOf[Byte] + val voidClass = classOf[Void] } From 4a5fe091658b1d06f427e404a11a84fc84f953c5 Mon Sep 17 00:00:00 2001 From: zsxwing Date: Mon, 31 Aug 2015 12:19:11 -0700 Subject: [PATCH 152/802] [SPARK-10369] [STREAMING] Don't remove ReceiverTrackingInfo when deregisterReceivering since we may reuse it later `deregisterReceiver` should not remove `ReceiverTrackingInfo`. Otherwise, it will throw `java.util.NoSuchElementException: key not found` when restarting it. Author: zsxwing Closes #8538 from zsxwing/SPARK-10369. 
--- .../streaming/scheduler/ReceiverTracker.scala | 4 +- .../scheduler/ReceiverTrackerSuite.scala | 51 +++++++++++++++++++ 2 files changed, 53 insertions(+), 2 deletions(-) diff --git a/streaming/src/main/scala/org/apache/spark/streaming/scheduler/ReceiverTracker.scala b/streaming/src/main/scala/org/apache/spark/streaming/scheduler/ReceiverTracker.scala index 3d532a675db02..f86fd44b48719 100644 --- a/streaming/src/main/scala/org/apache/spark/streaming/scheduler/ReceiverTracker.scala +++ b/streaming/src/main/scala/org/apache/spark/streaming/scheduler/ReceiverTracker.scala @@ -291,7 +291,7 @@ class ReceiverTracker(ssc: StreamingContext, skipReceiverLaunch: Boolean = false ReceiverTrackingInfo( streamId, ReceiverState.INACTIVE, None, None, None, None, Some(errorInfo)) } - receiverTrackingInfos -= streamId + receiverTrackingInfos(streamId) = newReceiverTrackingInfo listenerBus.post(StreamingListenerReceiverStopped(newReceiverTrackingInfo.toReceiverInfo)) val messageWithError = if (error != null && !error.isEmpty) { s"$message - $error" @@ -483,7 +483,7 @@ class ReceiverTracker(ssc: StreamingContext, skipReceiverLaunch: Boolean = false context.reply(true) // Local messages case AllReceiverIds => - context.reply(receiverTrackingInfos.keys.toSeq) + context.reply(receiverTrackingInfos.filter(_._2.state != ReceiverState.INACTIVE).keys.toSeq) case StopAllReceivers => assert(isTrackerStopping || isTrackerStopped) stopReceivers() diff --git a/streaming/src/test/scala/org/apache/spark/streaming/scheduler/ReceiverTrackerSuite.scala b/streaming/src/test/scala/org/apache/spark/streaming/scheduler/ReceiverTrackerSuite.scala index dd292ba4dd949..45138b748ecab 100644 --- a/streaming/src/test/scala/org/apache/spark/streaming/scheduler/ReceiverTrackerSuite.scala +++ b/streaming/src/test/scala/org/apache/spark/streaming/scheduler/ReceiverTrackerSuite.scala @@ -60,6 +60,26 @@ class ReceiverTrackerSuite extends TestSuiteBase { } } } + + test("should restart receiver after stopping it") { + withStreamingContext(new StreamingContext(conf, Milliseconds(100))) { ssc => + @volatile var startTimes = 0 + ssc.addStreamingListener(new StreamingListener { + override def onReceiverStarted(receiverStarted: StreamingListenerReceiverStarted): Unit = { + startTimes += 1 + } + }) + val input = ssc.receiverStream(new StoppableReceiver) + val output = new TestOutputStream(input) + output.register() + ssc.start() + StoppableReceiver.shouldStop = true + eventually(timeout(10 seconds), interval(10 millis)) { + // The receiver is stopped once, so if it's restarted, it should be started twice. 
+ assert(startTimes === 2) + } + } + } } /** An input DStream with for testing rate controlling */ @@ -132,3 +152,34 @@ private[streaming] object RateTestReceiver { def getActive(): Option[RateTestReceiver] = Option(activeReceiver) } + +/** + * A custom receiver that could be stopped via StoppableReceiver.shouldStop + */ +class StoppableReceiver extends Receiver[Int](StorageLevel.MEMORY_ONLY) { + + var receivingThreadOption: Option[Thread] = None + + def onStart() { + val thread = new Thread() { + override def run() { + while (!StoppableReceiver.shouldStop) { + Thread.sleep(10) + } + StoppableReceiver.this.stop("stop") + } + } + thread.start() + } + + def onStop() { + StoppableReceiver.shouldStop = true + receivingThreadOption.foreach(_.join()) + // Reset it so as to restart it + StoppableReceiver.shouldStop = false + } +} + +object StoppableReceiver { + @volatile var shouldStop = false +} From a2d5c72091b1c602694dbca823a7b26f86b02864 Mon Sep 17 00:00:00 2001 From: sureshthalamati Date: Mon, 31 Aug 2015 12:39:58 -0700 Subject: [PATCH 153/802] [SPARK-10170] [SQL] Add DB2 JDBC dialect support. Data frame write to DB2 database is failing because by default JDBC data source implementation is generating a table schema with DB2 unsupported data types TEXT for String, and BIT1(1) for Boolean. This patch registers DB2 JDBC Dialect that maps String, Boolean to valid DB2 data types. Author: sureshthalamati Closes #8393 from sureshthalamati/db2_dialect_spark-10170. --- .../apache/spark/sql/jdbc/JdbcDialects.scala | 18 ++++++++++++++++++ .../org/apache/spark/sql/jdbc/JDBCSuite.scala | 7 +++++++ 2 files changed, 25 insertions(+) diff --git a/sql/core/src/main/scala/org/apache/spark/sql/jdbc/JdbcDialects.scala b/sql/core/src/main/scala/org/apache/spark/sql/jdbc/JdbcDialects.scala index 8849fc2f1f0ef..c6d05c9b83b98 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/jdbc/JdbcDialects.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/jdbc/JdbcDialects.scala @@ -125,6 +125,7 @@ object JdbcDialects { registerDialect(MySQLDialect) registerDialect(PostgresDialect) + registerDialect(DB2Dialect) /** * Fetch the JdbcDialect class corresponding to a given database url. @@ -222,3 +223,20 @@ case object MySQLDialect extends JdbcDialect { s"`$colName`" } } + +/** + * :: DeveloperApi :: + * Default DB2 dialect, mapping string/boolean on write to valid DB2 types. + * By default string, and boolean gets mapped to db2 invalid types TEXT, and BIT(1). 
+ */ +@DeveloperApi +case object DB2Dialect extends JdbcDialect { + + override def canHandle(url: String): Boolean = url.startsWith("jdbc:db2") + + override def getJDBCType(dt: DataType): Option[JdbcType] = dt match { + case StringType => Some(JdbcType("CLOB", java.sql.Types.CLOB)) + case BooleanType => Some(JdbcType("CHAR(1)", java.sql.Types.CHAR)) + case _ => None + } +} diff --git a/sql/core/src/test/scala/org/apache/spark/sql/jdbc/JDBCSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/jdbc/JDBCSuite.scala index 0edac0848c3bb..d8c9a08d84c61 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/jdbc/JDBCSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/jdbc/JDBCSuite.scala @@ -407,6 +407,7 @@ class JDBCSuite extends SparkFunSuite with BeforeAndAfter with SharedSQLContext test("Default jdbc dialect registration") { assert(JdbcDialects.get("jdbc:mysql://127.0.0.1/db") == MySQLDialect) assert(JdbcDialects.get("jdbc:postgresql://127.0.0.1/db") == PostgresDialect) + assert(JdbcDialects.get("jdbc:db2://127.0.0.1/db") == DB2Dialect) assert(JdbcDialects.get("test.invalid") == NoopDialect) } @@ -443,4 +444,10 @@ class JDBCSuite extends SparkFunSuite with BeforeAndAfter with SharedSQLContext assert(agg.getCatalystType(0, "", 1, null) === Some(LongType)) assert(agg.getCatalystType(1, "", 1, null) === Some(StringType)) } + + test("DB2Dialect type mapping") { + val db2Dialect = JdbcDialects.get("jdbc:db2://127.0.0.1/db") + assert(db2Dialect.getJDBCType(StringType).map(_.databaseTypeDefinition).get == "CLOB") + assert(db2Dialect.getJDBCType(BooleanType).map(_.databaseTypeDefinition).get == "CHAR(1)") + } } From 23e39cc7b1bb7f1087c4706234c9b5165a571357 Mon Sep 17 00:00:00 2001 From: Xiangrui Meng Date: Mon, 31 Aug 2015 15:49:25 -0700 Subject: [PATCH 154/802] [SPARK-9954] [MLLIB] use first 128 nonzeros to compute Vector.hashCode This could help reduce hash collisions, e.g., in `RDD[Vector].repartition`. jkbradley Author: Xiangrui Meng Closes #8182 from mengxr/SPARK-9954. --- .../apache/spark/mllib/linalg/Vectors.scala | 38 ++++++++++--------- 1 file changed, 21 insertions(+), 17 deletions(-) diff --git a/mllib/src/main/scala/org/apache/spark/mllib/linalg/Vectors.scala b/mllib/src/main/scala/org/apache/spark/mllib/linalg/Vectors.scala index 06ebb15869909..3642e9286504f 100644 --- a/mllib/src/main/scala/org/apache/spark/mllib/linalg/Vectors.scala +++ b/mllib/src/main/scala/org/apache/spark/mllib/linalg/Vectors.scala @@ -71,20 +71,22 @@ sealed trait Vector extends Serializable { } /** - * Returns a hash code value for the vector. The hash code is based on its size and its nonzeros - * in the first 16 entries, using a hash algorithm similar to [[java.util.Arrays.hashCode]]. + * Returns a hash code value for the vector. The hash code is based on its size and its first 128 + * nonzero entries, using a hash algorithm similar to [[java.util.Arrays.hashCode]]. */ override def hashCode(): Int = { // This is a reference implementation. It calls return in foreachActive, which is slow. // Subclasses should override it with optimized implementation. 
var result: Int = 31 + size + var nnz = 0 this.foreachActive { (index, value) => - if (index < 16) { + if (nnz < Vectors.MAX_HASH_NNZ) { // ignore explicit 0 for comparison between sparse and dense if (value != 0) { result = 31 * result + index val bits = java.lang.Double.doubleToLongBits(value) result = 31 * result + (bits ^ (bits >>> 32)).toInt + nnz += 1 } } else { return result @@ -536,6 +538,9 @@ object Vectors { } allEqual } + + /** Max number of nonzero entries used in computing hash code. */ + private[linalg] val MAX_HASH_NNZ = 128 } /** @@ -578,13 +583,15 @@ class DenseVector @Since("1.0.0") ( override def hashCode(): Int = { var result: Int = 31 + size var i = 0 - val end = math.min(values.length, 16) - while (i < end) { + val end = values.length + var nnz = 0 + while (i < end && nnz < Vectors.MAX_HASH_NNZ) { val v = values(i) if (v != 0.0) { result = 31 * result + i val bits = java.lang.Double.doubleToLongBits(values(i)) result = 31 * result + (bits ^ (bits >>> 32)).toInt + nnz += 1 } i += 1 } @@ -707,19 +714,16 @@ class SparseVector @Since("1.0.0") ( override def hashCode(): Int = { var result: Int = 31 + size val end = values.length - var continue = true var k = 0 - while ((k < end) & continue) { - val i = indices(k) - if (i < 16) { - val v = values(k) - if (v != 0.0) { - result = 31 * result + i - val bits = java.lang.Double.doubleToLongBits(v) - result = 31 * result + (bits ^ (bits >>> 32)).toInt - } - } else { - continue = false + var nnz = 0 + while (k < end && nnz < Vectors.MAX_HASH_NNZ) { + val v = values(k) + if (v != 0.0) { + val i = indices(k) + result = 31 * result + i + val bits = java.lang.Double.doubleToLongBits(v) + result = 31 * result + (bits ^ (bits >>> 32)).toInt + nnz += 1 } k += 1 } From 5b3245d6dff65972fc39c73f90d5cbdf84d19129 Mon Sep 17 00:00:00 2001 From: Yanbo Liang Date: Mon, 31 Aug 2015 15:50:41 -0700 Subject: [PATCH 155/802] [SPARK-8472] [ML] [PySpark] Python API for DCT Add Python API for ml.feature.DCT. Author: Yanbo Liang Closes #8485 from yanboliang/spark-8472. --- python/pyspark/ml/feature.py | 65 +++++++++++++++++++++++++++++++++++- 1 file changed, 64 insertions(+), 1 deletion(-) diff --git a/python/pyspark/ml/feature.py b/python/pyspark/ml/feature.py index 04b2b2ccc9e55..59300a607815b 100644 --- a/python/pyspark/ml/feature.py +++ b/python/pyspark/ml/feature.py @@ -26,7 +26,7 @@ from pyspark.mllib.common import inherit_doc from pyspark.mllib.linalg import _convert_to_vector -__all__ = ['Binarizer', 'Bucketizer', 'ElementwiseProduct', 'HashingTF', 'IDF', 'IDFModel', +__all__ = ['Binarizer', 'Bucketizer', 'DCT', 'ElementwiseProduct', 'HashingTF', 'IDF', 'IDFModel', 'NGram', 'Normalizer', 'OneHotEncoder', 'PolynomialExpansion', 'RegexTokenizer', 'StandardScaler', 'StandardScalerModel', 'StringIndexer', 'StringIndexerModel', 'Tokenizer', 'VectorAssembler', 'VectorIndexer', 'Word2Vec', 'Word2VecModel', @@ -166,6 +166,69 @@ def getSplits(self): return self.getOrDefault(self.splits) +@inherit_doc +class DCT(JavaTransformer, HasInputCol, HasOutputCol): + """ + A feature transformer that takes the 1D discrete cosine transform + of a real vector. No zero padding is performed on the input vector. + It returns a real vector of the same length representing the DCT. + The return vector is scaled such that the transform matrix is + unitary (aka scaled DCT-II). + + More information on + `https://en.wikipedia.org/wiki/Discrete_cosine_transform#DCT-II Wikipedia`. 
+ + >>> from pyspark.mllib.linalg import Vectors + >>> df1 = sqlContext.createDataFrame([(Vectors.dense([5.0, 8.0, 6.0]),)], ["vec"]) + >>> dct = DCT(inverse=False, inputCol="vec", outputCol="resultVec") + >>> df2 = dct.transform(df1) + >>> df2.head().resultVec + DenseVector([10.969..., -0.707..., -2.041...]) + >>> df3 = DCT(inverse=True, inputCol="resultVec", outputCol="origVec").transform(df2) + >>> df3.head().origVec + DenseVector([5.0, 8.0, 6.0]) + """ + + # a placeholder to make it appear in the generated doc + inverse = Param(Params._dummy(), "inverse", "Set transformer to perform inverse DCT, " + + "default False.") + + @keyword_only + def __init__(self, inverse=False, inputCol=None, outputCol=None): + """ + __init__(self, inverse=False, inputCol=None, outputCol=None) + """ + super(DCT, self).__init__() + self._java_obj = self._new_java_obj("org.apache.spark.ml.feature.DCT", self.uid) + self.inverse = Param(self, "inverse", "Set transformer to perform inverse DCT, " + + "default False.") + self._setDefault(inverse=False) + kwargs = self.__init__._input_kwargs + self.setParams(**kwargs) + + @keyword_only + def setParams(self, inverse=False, inputCol=None, outputCol=None): + """ + setParams(self, inverse=False, inputCol=None, outputCol=None) + Sets params for this DCT. + """ + kwargs = self.setParams._input_kwargs + return self._set(**kwargs) + + def setInverse(self, value): + """ + Sets the value of :py:attr:`inverse`. + """ + self._paramMap[self.inverse] = value + return self + + def getInverse(self): + """ + Gets the value of inverse or its default value. + """ + return self.getOrDefault(self.inverse) + + @inherit_doc class ElementwiseProduct(JavaTransformer, HasInputCol, HasOutputCol): """ From 540bdee93103a73736d282b95db6a8cda8f6a2b1 Mon Sep 17 00:00:00 2001 From: Davies Liu Date: Mon, 31 Aug 2015 15:55:22 -0700 Subject: [PATCH 156/802] [SPARK-10341] [SQL] fix memory starving in unsafe SMJ In SMJ, the first ExternalSorter could consume all the memory before spilling, then the second can not even acquire the first page. Before we have a better memory allocator, SMJ should call prepare() before call any compute() of it's children. cc rxin JoshRosen Author: Davies Liu Closes #8511 from davies/smj_memory. --- .../rdd/MapPartitionsWithPreparationRDD.scala | 21 +++++++++++++++++-- .../spark/rdd/ZippedPartitionsRDD.scala | 13 ++++++++++++ ...MapPartitionsWithPreparationRDDSuite.scala | 14 +++++++++---- 3 files changed, 42 insertions(+), 6 deletions(-) diff --git a/core/src/main/scala/org/apache/spark/rdd/MapPartitionsWithPreparationRDD.scala b/core/src/main/scala/org/apache/spark/rdd/MapPartitionsWithPreparationRDD.scala index b475bd8d79f85..1f2213d0c4346 100644 --- a/core/src/main/scala/org/apache/spark/rdd/MapPartitionsWithPreparationRDD.scala +++ b/core/src/main/scala/org/apache/spark/rdd/MapPartitionsWithPreparationRDD.scala @@ -17,6 +17,7 @@ package org.apache.spark.rdd +import scala.collection.mutable.ArrayBuffer import scala.reflect.ClassTag import org.apache.spark.{Partition, Partitioner, TaskContext} @@ -38,12 +39,28 @@ private[spark] class MapPartitionsWithPreparationRDD[U: ClassTag, T: ClassTag, M override def getPartitions: Array[Partition] = firstParent[T].partitions + // In certain join operations, prepare can be called on the same partition multiple times. + // In this case, we need to ensure that each call to compute gets a separate prepare argument. 
+ private[this] var preparedArguments: ArrayBuffer[M] = new ArrayBuffer[M] + + /** + * Prepare a partition for a single call to compute. + */ + def prepare(): Unit = { + preparedArguments += preparePartition() + } + /** * Prepare a partition before computing it from its parent. */ override def compute(partition: Partition, context: TaskContext): Iterator[U] = { - val preparedArgument = preparePartition() + val prepared = + if (preparedArguments.isEmpty) { + preparePartition() + } else { + preparedArguments.remove(0) + } val parentIterator = firstParent[T].iterator(partition, context) - executePartition(context, partition.index, preparedArgument, parentIterator) + executePartition(context, partition.index, prepared, parentIterator) } } diff --git a/core/src/main/scala/org/apache/spark/rdd/ZippedPartitionsRDD.scala b/core/src/main/scala/org/apache/spark/rdd/ZippedPartitionsRDD.scala index 81f40ad33aa5d..b3c64394abc76 100644 --- a/core/src/main/scala/org/apache/spark/rdd/ZippedPartitionsRDD.scala +++ b/core/src/main/scala/org/apache/spark/rdd/ZippedPartitionsRDD.scala @@ -73,6 +73,16 @@ private[spark] abstract class ZippedPartitionsBaseRDD[V: ClassTag]( super.clearDependencies() rdds = null } + + /** + * Call the prepare method of every parent that has one. + * This is needed for reserving execution memory in advance. + */ + protected def tryPrepareParents(): Unit = { + rdds.collect { + case rdd: MapPartitionsWithPreparationRDD[_, _, _] => rdd.prepare() + } + } } private[spark] class ZippedPartitionsRDD2[A: ClassTag, B: ClassTag, V: ClassTag]( @@ -84,6 +94,7 @@ private[spark] class ZippedPartitionsRDD2[A: ClassTag, B: ClassTag, V: ClassTag] extends ZippedPartitionsBaseRDD[V](sc, List(rdd1, rdd2), preservesPartitioning) { override def compute(s: Partition, context: TaskContext): Iterator[V] = { + tryPrepareParents() val partitions = s.asInstanceOf[ZippedPartitionsPartition].partitions f(rdd1.iterator(partitions(0), context), rdd2.iterator(partitions(1), context)) } @@ -107,6 +118,7 @@ private[spark] class ZippedPartitionsRDD3 extends ZippedPartitionsBaseRDD[V](sc, List(rdd1, rdd2, rdd3), preservesPartitioning) { override def compute(s: Partition, context: TaskContext): Iterator[V] = { + tryPrepareParents() val partitions = s.asInstanceOf[ZippedPartitionsPartition].partitions f(rdd1.iterator(partitions(0), context), rdd2.iterator(partitions(1), context), @@ -134,6 +146,7 @@ private[spark] class ZippedPartitionsRDD4 extends ZippedPartitionsBaseRDD[V](sc, List(rdd1, rdd2, rdd3, rdd4), preservesPartitioning) { override def compute(s: Partition, context: TaskContext): Iterator[V] = { + tryPrepareParents() val partitions = s.asInstanceOf[ZippedPartitionsPartition].partitions f(rdd1.iterator(partitions(0), context), rdd2.iterator(partitions(1), context), diff --git a/core/src/test/scala/org/apache/spark/rdd/MapPartitionsWithPreparationRDDSuite.scala b/core/src/test/scala/org/apache/spark/rdd/MapPartitionsWithPreparationRDDSuite.scala index c16930e7d6491..e281e817e493d 100644 --- a/core/src/test/scala/org/apache/spark/rdd/MapPartitionsWithPreparationRDDSuite.scala +++ b/core/src/test/scala/org/apache/spark/rdd/MapPartitionsWithPreparationRDDSuite.scala @@ -46,11 +46,17 @@ class MapPartitionsWithPreparationRDDSuite extends SparkFunSuite with LocalSpark } // Verify that the numbers are pushed in the order expected - val result = { - new MapPartitionsWithPreparationRDD[Int, Int, Unit]( - parent, preparePartition, executePartition).collect() - } + val rdd = new MapPartitionsWithPreparationRDD[Int, Int, 
Unit]( + parent, preparePartition, executePartition) + val result = rdd.collect() assert(result === Array(10, 20, 30)) + + TestObject.things.clear() + // Zip two of these RDDs, both should be prepared before the parent is executed + val rdd2 = new MapPartitionsWithPreparationRDD[Int, Int, Unit]( + parent, preparePartition, executePartition) + val result2 = rdd.zipPartitions(rdd2)((a, b) => a).collect() + assert(result2 === Array(10, 10, 20, 30, 20, 30)) } } From fe16fd0b8b717f01151bc659ec3299dab091c97a Mon Sep 17 00:00:00 2001 From: Yanbo Liang Date: Mon, 31 Aug 2015 16:06:38 -0700 Subject: [PATCH 157/802] [SPARK-10349] [ML] OneVsRest use 'when ... otherwise' not UDF to generate new label at binary reduction Currently OneVsRest use UDF to generate new binary label during training. Considering that [SPARK-7321](https://issues.apache.org/jira/browse/SPARK-7321) has been merged, we can use ```when ... otherwise``` which will be more efficiency. Author: Yanbo Liang Closes #8519 from yanboliang/spark-10349. --- .../org/apache/spark/ml/classification/OneVsRest.scala | 10 ++-------- 1 file changed, 2 insertions(+), 8 deletions(-) diff --git a/mllib/src/main/scala/org/apache/spark/ml/classification/OneVsRest.scala b/mllib/src/main/scala/org/apache/spark/ml/classification/OneVsRest.scala index c62e132f5d533..debc164bf2432 100644 --- a/mllib/src/main/scala/org/apache/spark/ml/classification/OneVsRest.scala +++ b/mllib/src/main/scala/org/apache/spark/ml/classification/OneVsRest.scala @@ -91,7 +91,6 @@ final class OneVsRestModel private[ml] ( // add an accumulator column to store predictions of all the models val accColName = "mbc$acc" + UUID.randomUUID().toString val initUDF = udf { () => Map[Int, Double]() } - val mapType = MapType(IntegerType, DoubleType, valueContainsNull = false) val newDataset = dataset.withColumn(accColName, initUDF()) // persist if underlying dataset is not persistent. @@ -195,16 +194,11 @@ final class OneVsRest(override val uid: String) // create k columns, one for each binary classifier. val models = Range(0, numClasses).par.map { index => - val labelUDF = udf { (label: Double) => - if (label.toInt == index) 1.0 else 0.0 - } - // generate new label metadata for the binary problem. - // TODO: use when ... otherwise after SPARK-7321 is merged val newLabelMeta = BinaryAttribute.defaultAttr.withName("label").toMetadata() val labelColName = "mc2b$" + index - val trainingDataset = - multiclassLabeled.withColumn(labelColName, labelUDF(col($(labelCol))), newLabelMeta) + val trainingDataset = multiclassLabeled.withColumn( + labelColName, when(col($(labelCol)) === index.toDouble, 1.0).otherwise(0.0), newLabelMeta) val classifier = getClassifier val paramMap = new ParamMap() paramMap.put(classifier.labelCol -> labelColName) From 52ea399e6ee37b7c44aae7709863e006fca88906 Mon Sep 17 00:00:00 2001 From: Yanbo Liang Date: Mon, 31 Aug 2015 16:11:27 -0700 Subject: [PATCH 158/802] [SPARK-10355] [ML] [PySpark] Add Python API for SQLTransformer Add Python API for SQLTransformer Author: Yanbo Liang Closes #8527 from yanboliang/spark-10355. 
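For reference, the Scala `SQLTransformer` that this Python wrapper delegates to is driven the same way. A minimal usage sketch mirroring the doctest added below; it assumes an existing `SQLContext` is passed in by the surrounding application:

```
import org.apache.spark.ml.feature.SQLTransformer
import org.apache.spark.sql.SQLContext

object SQLTransformerSketch {
  def example(sqlContext: SQLContext): Unit = {
    val df = sqlContext.createDataFrame(Seq((0, 1.0, 3.0), (2, 2.0, 5.0))).toDF("id", "v1", "v2")
    val sqlTrans = new SQLTransformer()
      .setStatement("SELECT *, (v1 + v2) AS v3, (v1 * v2) AS v4 FROM __THIS__")
    // __THIS__ is substituted with the input DataFrame before the statement runs,
    // so the result adds v3 = v1 + v2 and v4 = v1 * v2 to every row.
    sqlTrans.transform(df).show()
  }
}
```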
--- python/pyspark/ml/feature.py | 57 ++++++++++++++++++++++++++++++++++-- 1 file changed, 54 insertions(+), 3 deletions(-) diff --git a/python/pyspark/ml/feature.py b/python/pyspark/ml/feature.py index 59300a607815b..0626281e200a1 100644 --- a/python/pyspark/ml/feature.py +++ b/python/pyspark/ml/feature.py @@ -28,9 +28,9 @@ __all__ = ['Binarizer', 'Bucketizer', 'DCT', 'ElementwiseProduct', 'HashingTF', 'IDF', 'IDFModel', 'NGram', 'Normalizer', 'OneHotEncoder', 'PolynomialExpansion', 'RegexTokenizer', - 'StandardScaler', 'StandardScalerModel', 'StringIndexer', 'StringIndexerModel', - 'Tokenizer', 'VectorAssembler', 'VectorIndexer', 'Word2Vec', 'Word2VecModel', - 'PCA', 'PCAModel', 'RFormula', 'RFormulaModel'] + 'SQLTransformer', 'StandardScaler', 'StandardScalerModel', 'StringIndexer', + 'StringIndexerModel', 'Tokenizer', 'VectorAssembler', 'VectorIndexer', 'Word2Vec', + 'Word2VecModel', 'PCA', 'PCAModel', 'RFormula', 'RFormulaModel'] @inherit_doc @@ -743,6 +743,57 @@ def getPattern(self): return self.getOrDefault(self.pattern) +@inherit_doc +class SQLTransformer(JavaTransformer): + """ + Implements the transforms which are defined by SQL statement. + Currently we only support SQL syntax like 'SELECT ... FROM __THIS__' + where '__THIS__' represents the underlying table of the input dataset. + + >>> df = sqlContext.createDataFrame([(0, 1.0, 3.0), (2, 2.0, 5.0)], ["id", "v1", "v2"]) + >>> sqlTrans = SQLTransformer( + ... statement="SELECT *, (v1 + v2) AS v3, (v1 * v2) AS v4 FROM __THIS__") + >>> sqlTrans.transform(df).head() + Row(id=0, v1=1.0, v2=3.0, v3=4.0, v4=3.0) + """ + + # a placeholder to make it appear in the generated doc + statement = Param(Params._dummy(), "statement", "SQL statement") + + @keyword_only + def __init__(self, statement=None): + """ + __init__(self, statement=None) + """ + super(SQLTransformer, self).__init__() + self._java_obj = self._new_java_obj("org.apache.spark.ml.feature.SQLTransformer", self.uid) + self.statement = Param(self, "statement", "SQL statement") + kwargs = self.__init__._input_kwargs + self.setParams(**kwargs) + + @keyword_only + def setParams(self, statement=None): + """ + setParams(self, statement=None) + Sets params for this SQLTransformer. + """ + kwargs = self.setParams._input_kwargs + return self._set(**kwargs) + + def setStatement(self, value): + """ + Sets the value of :py:attr:`statement`. + """ + self._paramMap[self.statement] = value + return self + + def getStatement(self): + """ + Gets the value of statement or its default value. + """ + return self.getOrDefault(self.statement) + + @inherit_doc class StandardScaler(JavaEstimator, HasInputCol, HasOutputCol): """ From d65656c455d19b83c6412571873586b458aa355e Mon Sep 17 00:00:00 2001 From: Reynold Xin Date: Mon, 31 Aug 2015 18:09:24 -0700 Subject: [PATCH 159/802] [SPARK-10378][SQL][Test] Remove HashJoinCompatibilitySuite. They don't bring much value since we now have better unit test coverage for hash joins. This will also help reduce the test time. Author: Reynold Xin Closes #8542 from rxin/SPARK-10378. 
--- .../HashJoinCompatibilitySuite.scala | 169 ------------------ 1 file changed, 169 deletions(-) delete mode 100644 sql/hive/compatibility/src/test/scala/org/apache/spark/sql/hive/execution/HashJoinCompatibilitySuite.scala diff --git a/sql/hive/compatibility/src/test/scala/org/apache/spark/sql/hive/execution/HashJoinCompatibilitySuite.scala b/sql/hive/compatibility/src/test/scala/org/apache/spark/sql/hive/execution/HashJoinCompatibilitySuite.scala deleted file mode 100644 index 1a5ba20404c4e..0000000000000 --- a/sql/hive/compatibility/src/test/scala/org/apache/spark/sql/hive/execution/HashJoinCompatibilitySuite.scala +++ /dev/null @@ -1,169 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -package org.apache.spark.sql.hive.execution - -import java.io.File - -import org.apache.spark.sql.SQLConf -import org.apache.spark.sql.hive.test.TestHive - -/** - * Runs the test cases that are included in the hive distribution with hash joins. - */ -class HashJoinCompatibilitySuite extends HiveCompatibilitySuite { - override def beforeAll() { - super.beforeAll() - TestHive.setConf(SQLConf.SORTMERGE_JOIN, false) - } - - override def afterAll() { - TestHive.setConf(SQLConf.SORTMERGE_JOIN, true) - super.afterAll() - } - - override def whiteList = Seq( - "auto_join0", - "auto_join1", - "auto_join10", - "auto_join11", - "auto_join12", - "auto_join13", - "auto_join14", - "auto_join14_hadoop20", - "auto_join15", - "auto_join17", - "auto_join18", - "auto_join19", - "auto_join2", - "auto_join20", - "auto_join21", - "auto_join22", - "auto_join23", - "auto_join24", - "auto_join25", - "auto_join26", - "auto_join27", - "auto_join28", - "auto_join3", - "auto_join30", - "auto_join31", - "auto_join32", - "auto_join4", - "auto_join5", - "auto_join6", - "auto_join7", - "auto_join8", - "auto_join9", - "auto_join_filters", - "auto_join_nulls", - "auto_join_reordering_values", - "auto_smb_mapjoin_14", - "auto_sortmerge_join_1", - "auto_sortmerge_join_10", - "auto_sortmerge_join_11", - "auto_sortmerge_join_12", - "auto_sortmerge_join_13", - "auto_sortmerge_join_14", - "auto_sortmerge_join_15", - "auto_sortmerge_join_16", - "auto_sortmerge_join_2", - "auto_sortmerge_join_3", - "auto_sortmerge_join_4", - "auto_sortmerge_join_5", - "auto_sortmerge_join_6", - "auto_sortmerge_join_7", - "auto_sortmerge_join_8", - "auto_sortmerge_join_9", - "correlationoptimizer1", - "correlationoptimizer10", - "correlationoptimizer11", - "correlationoptimizer13", - "correlationoptimizer14", - "correlationoptimizer15", - "correlationoptimizer2", - "correlationoptimizer3", - "correlationoptimizer4", - "correlationoptimizer6", - "correlationoptimizer7", - "correlationoptimizer8", - "correlationoptimizer9", - "join0", - "join1", - "join10", - "join11", - "join12", - "join13", - "join14", - 
"join14_hadoop20", - "join15", - "join16", - "join17", - "join18", - "join19", - "join2", - "join20", - "join21", - "join22", - "join23", - "join24", - "join25", - "join26", - "join27", - "join28", - "join29", - "join3", - "join30", - "join31", - "join32", - "join32_lessSize", - "join33", - "join34", - "join35", - "join36", - "join37", - "join38", - "join39", - "join4", - "join40", - "join41", - "join5", - "join6", - "join7", - "join8", - "join9", - "join_1to1", - "join_array", - "join_casesensitive", - "join_empty", - "join_filters", - "join_hive_626", - "join_map_ppr", - "join_nulls", - "join_nullsafe", - "join_rc", - "join_reorder2", - "join_reorder3", - "join_reorder4", - "join_star" - ) - - // Only run those query tests in the realWhileList (do not try other ignored query files). - override def testCases: Seq[(String, File)] = super.testCases.filter { - case (name, _) => realWhiteList.contains(name) - } -} From 391e6be0ae883f3ea0fab79463eb8b618af79afb Mon Sep 17 00:00:00 2001 From: Cheng Lian Date: Tue, 1 Sep 2015 16:52:59 +0800 Subject: [PATCH 160/802] [SPARK-10301] [SQL] Fixes schema merging for nested structs This PR can be quite challenging to review. I'm trying to give a detailed description of the problem as well as its solution here. When reading Parquet files, we need to specify a potentially nested Parquet schema (of type `MessageType`) as requested schema for column pruning. This Parquet schema is translated from a Catalyst schema (of type `StructType`), which is generated by the query planner and represents all requested columns. However, this translation can be fairly complicated because of several reasons: 1. Requested schema must conform to the real schema of the physical file to be read. This means we have to tailor the actual file schema of every individual physical Parquet file to be read according to the given Catalyst schema. Fortunately we are already doing this in Spark 1.5 by pushing request schema conversion to executor side in PR #7231. 1. Support for schema merging. A single Parquet dataset may consist of multiple physical Parquet files come with different but compatible schemas. This means we may request for a column path that doesn't exist in a physical Parquet file. All requested column paths can be nested. For example, for a Parquet file schema ``` message root { required group f0 { required group f00 { required int32 f000; required binary f001 (UTF8); } } } ``` we may request for column paths defined in the following schema: ``` message root { required group f0 { required group f00 { required binary f001 (UTF8); required float f002; } } optional double f1; } ``` Notice that we pruned column path `f0.f00.f000`, but added `f0.f00.f002` and `f1`. The good news is that Parquet handles non-existing column paths properly and always returns null for them. 1. The map from `StructType` to `MessageType` is a one-to-many map. This is the most unfortunate part. Due to historical reasons (dark histories!), schemas of Parquet files generated by different libraries have different "flavors". 
For example, to handle a schema with a single non-nullable column, whose type is an array of non-nullable integers, parquet-protobuf generates the following Parquet schema: ``` message m0 { repeated int32 f; } ``` while parquet-avro generates another version: ``` message m1 { required group f (LIST) { repeated int32 array; } } ``` and parquet-thrift spills this: ``` message m1 { required group f (LIST) { repeated int32 f_tuple; } } ``` All of them can be mapped to the following _unique_ Catalyst schema: ``` StructType( StructField( "f", ArrayType(IntegerType, containsNull = false), nullable = false)) ``` This greatly complicates Parquet requested schema construction, since the path of a given column varies in different cases. To read the array elements from files with the above schemas, we must use `f` for `m0`, `f.array` for `m1`, and `f.f_tuple` for `m2`. In earlier Spark versions, we didn't try to fix this issue properly. Spark 1.4 and prior versions simply translate the Catalyst schema in a way more or less compatible with parquet-hive and parquet-avro, but is broken in many other cases. Earlier revisions of Spark 1.5 only try to tailor the Parquet file schema at the first level, and ignore nested ones. This caused [SPARK-10301] [spark-10301] as well as [SPARK-10005] [spark-10005]. In PR #8228, I tried to avoid the hard part of the problem and made a minimum change in `CatalystRowConverter` to fix SPARK-10005. However, when taking SPARK-10301 into consideration, keeping hacking `CatalystRowConverter` doesn't seem to be a good idea. So this PR is an attempt to fix the problem in a proper way. For a given physical Parquet file with schema `ps` and a compatible Catalyst requested schema `cs`, we use the following algorithm to tailor `ps` to get the result Parquet requested schema `ps'`: For a leaf column path `c` in `cs`: - if `c` exists in `cs` and a corresponding Parquet column path `c'` can be found in `ps`, `c'` should be included in `ps'`; - otherwise, we convert `c` to a Parquet column path `c"` using `CatalystSchemaConverter`, and include `c"` in `ps'`; - no other column paths should exist in `ps'`. Then comes the most tedious part: > Given `cs`, `ps`, and `c`, how to locate `c'` in `ps`? Unfortunately, there's no quick answer, and we have to enumerate all possible structures defined in parquet-format spec. They are: 1. the standard structure of nested types, and 1. cases defined in all backwards-compatibility rules for `LIST` and `MAP`. The core part of this PR is `CatalystReadSupport.clipParquetType()`, which tailors a given Parquet file schema according to a requested schema in its Catalyst form. Backwards-compatibility rules of `LIST` and `MAP` are covered in `clipParquetListType()` and `clipParquetMapType()` respectively. The column path selection algorithm is implemented in `clipParquetGroupFields()`. With this PR, we no longer need to do schema tailoring in `CatalystReadSupport` and `CatalystRowConverter`. Another benefit is that, now we can also read Parquet datasets consist of files with different physical Parquet schema but share the same logical schema, for example, files generated by different Parquet libraries. This situation is illustrated by [this test case] [test-case]. 
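The tailoring step described above can be exercised directly through the `CatalystReadSupport.clipParquetSchema` method this patch introduces (the new `testSchemaClipping` helper in `ParquetSchemaSuite` does exactly this). A minimal sketch using the example schemas from this description; because `clipParquetSchema` is `private[parquet]`, the sketch assumes it is compiled inside that package:

```
package org.apache.spark.sql.execution.datasources.parquet

import org.apache.parquet.schema.MessageTypeParser
import org.apache.spark.sql.types._

object SchemaClippingSketch {
  def main(args: Array[String]): Unit = {
    // Physical schema of the Parquet file being read.
    val fileSchema = MessageTypeParser.parseMessageType(
      """message root {
        |  required group f0 {
        |    required group f00 {
        |      required int32 f000;
        |      required binary f001 (UTF8);
        |    }
        |  }
        |}
      """.stripMargin)

    // Catalyst requested schema: f0.f00.f001 exists in the file,
    // while f0.f00.f002 and f1 do not.
    val f00Type = new StructType()
      .add("f001", StringType, nullable = false)
      .add("f002", FloatType, nullable = false)
    val f0Type = new StructType().add("f00", f00Type, nullable = false)
    val requestedSchema = new StructType()
      .add("f0", f0Type, nullable = false)
      .add("f1", DoubleType, nullable = true)

    val clipped = CatalystReadSupport.clipParquetSchema(fileSchema, requestedSchema)
    // f000 is pruned; f001 keeps the file's physical type; f002 and f1 are
    // synthesized from the Catalyst schema and will read as nulls.
    println(clipped)
  }
}
```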
[spark-10301]: https://issues.apache.org/jira/browse/SPARK-10301 [spark-10005]: https://issues.apache.org/jira/browse/SPARK-10005 [test-case]: https://github.com/liancheng/spark/commit/38644d8a45175cbdf20d2ace021c2c2544a50ab3#diff-a9b98e28ce3ae30641829dffd1173be2R26 Author: Cheng Lian Closes #8509 from liancheng/spark-10301/fix-parquet-requested-schema. --- .../parquet/CatalystReadSupport.scala | 235 +++++++++---- .../parquet/CatalystRowConverter.scala | 51 +-- .../parquet/CatalystSchemaConverter.scala | 14 +- .../ParquetAvroCompatibilitySuite.scala | 1 + .../ParquetInteroperabilitySuite.scala | 90 +++++ .../parquet/ParquetQuerySuite.scala | 77 +++++ .../parquet/ParquetSchemaSuite.scala | 310 ++++++++++++++++++ 7 files changed, 653 insertions(+), 125 deletions(-) create mode 100644 sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/parquet/ParquetInteroperabilitySuite.scala diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/parquet/CatalystReadSupport.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/parquet/CatalystReadSupport.scala index 0a6bb44445f6e..dc4ff06df6f22 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/parquet/CatalystReadSupport.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/parquet/CatalystReadSupport.scala @@ -19,17 +19,18 @@ package org.apache.spark.sql.execution.datasources.parquet import java.util.{Map => JMap} -import scala.collection.JavaConverters._ +import scala.collection.JavaConverters.{collectionAsScalaIterableConverter, mapAsJavaMapConverter, mapAsScalaMapConverter} import org.apache.hadoop.conf.Configuration import org.apache.parquet.hadoop.api.ReadSupport.ReadContext import org.apache.parquet.hadoop.api.{InitContext, ReadSupport} import org.apache.parquet.io.api.RecordMaterializer -import org.apache.parquet.schema.MessageType +import org.apache.parquet.schema.Type.Repetition +import org.apache.parquet.schema._ import org.apache.spark.Logging import org.apache.spark.sql.catalyst.InternalRow -import org.apache.spark.sql.types.StructType +import org.apache.spark.sql.types._ private[parquet] class CatalystReadSupport extends ReadSupport[InternalRow] with Logging { // Called after `init()` when initializing Parquet record reader. @@ -81,70 +82,10 @@ private[parquet] class CatalystReadSupport extends ReadSupport[InternalRow] with // `StructType` containing all requested columns. val maybeRequestedSchema = Option(conf.get(CatalystReadSupport.SPARK_ROW_REQUESTED_SCHEMA)) - // Below we construct a Parquet schema containing all requested columns. This schema tells - // Parquet which columns to read. - // - // If `maybeRequestedSchema` is defined, we assemble an equivalent Parquet schema. Otherwise, - // we have to fallback to the full file schema which contains all columns in the file. - // Obviously this may waste IO bandwidth since it may read more columns than requested. - // - // Two things to note: - // - // 1. It's possible that some requested columns don't exist in the target Parquet file. For - // example, in the case of schema merging, the globally merged schema may contain extra - // columns gathered from other Parquet files. These columns will be simply filled with nulls - // when actually reading the target Parquet file. - // - // 2. 
When `maybeRequestedSchema` is available, we can't simply convert the Catalyst schema to - // Parquet schema using `CatalystSchemaConverter`, because the mapping is not unique due to - // non-standard behaviors of some Parquet libraries/tools. For example, a Parquet file - // containing a single integer array field `f1` may have the following legacy 2-level - // structure: - // - // message root { - // optional group f1 (LIST) { - // required INT32 element; - // } - // } - // - // while `CatalystSchemaConverter` may generate a standard 3-level structure: - // - // message root { - // optional group f1 (LIST) { - // repeated group list { - // required INT32 element; - // } - // } - // } - // - // Apparently, we can't use the 2nd schema to read the target Parquet file as they have - // different physical structures. val parquetRequestedSchema = maybeRequestedSchema.fold(context.getFileSchema) { schemaString => - val toParquet = new CatalystSchemaConverter(conf) - val fileSchema = context.getFileSchema.asGroupType() - val fileFieldNames = fileSchema.getFields.asScala.map(_.getName).toSet - - StructType - // Deserializes the Catalyst schema of requested columns - .fromString(schemaString) - .map { field => - if (fileFieldNames.contains(field.name)) { - // If the field exists in the target Parquet file, extracts the field type from the - // full file schema and makes a single-field Parquet schema - new MessageType("root", fileSchema.getType(field.name)) - } else { - // Otherwise, just resorts to `CatalystSchemaConverter` - toParquet.convert(StructType(Array(field))) - } - } - // Merges all single-field Parquet schemas to form a complete schema for all requested - // columns. Note that it's possible that no columns are requested at all (e.g., count - // some partition column of a partitioned Parquet table). That's why `fold` is used here - // and always fallback to an empty Parquet schema. - .fold(new MessageType("root")) { - _ union _ - } + val catalystRequestedSchema = StructType.fromString(schemaString) + CatalystReadSupport.clipParquetSchema(context.getFileSchema, catalystRequestedSchema) } val metadata = @@ -160,4 +101,168 @@ private[parquet] object CatalystReadSupport { val SPARK_ROW_REQUESTED_SCHEMA = "org.apache.spark.sql.parquet.row.requested_schema" val SPARK_METADATA_KEY = "org.apache.spark.sql.parquet.row.metadata" + + /** + * Tailors `parquetSchema` according to `catalystSchema` by removing column paths don't exist + * in `catalystSchema`, and adding those only exist in `catalystSchema`. + */ + def clipParquetSchema(parquetSchema: MessageType, catalystSchema: StructType): MessageType = { + val clippedParquetFields = clipParquetGroupFields(parquetSchema.asGroupType(), catalystSchema) + Types.buildMessage().addFields(clippedParquetFields: _*).named("root") + } + + private def clipParquetType(parquetType: Type, catalystType: DataType): Type = { + catalystType match { + case t: ArrayType if !isPrimitiveCatalystType(t.elementType) => + // Only clips array types with nested type as element type. + clipParquetListType(parquetType.asGroupType(), t.elementType) + + case t: MapType if !isPrimitiveCatalystType(t.valueType) => + // Only clips map types with nested type as value type. + clipParquetMapType(parquetType.asGroupType(), t.keyType, t.valueType) + + case t: StructType => + clipParquetGroup(parquetType.asGroupType(), t) + + case _ => + parquetType + } + } + + /** + * Whether a Catalyst [[DataType]] is primitive. Primitive [[DataType]] is not equivalent to + * [[AtomicType]]. 
For example, [[CalendarIntervalType]] is primitive, but it's not an + * [[AtomicType]]. + */ + private def isPrimitiveCatalystType(dataType: DataType): Boolean = { + dataType match { + case _: ArrayType | _: MapType | _: StructType => false + case _ => true + } + } + + /** + * Clips a Parquet [[GroupType]] which corresponds to a Catalyst [[ArrayType]]. The element type + * of the [[ArrayType]] should also be a nested type, namely an [[ArrayType]], a [[MapType]], or a + * [[StructType]]. + */ + private def clipParquetListType(parquetList: GroupType, elementType: DataType): Type = { + // Precondition of this method, should only be called for lists with nested element types. + assert(!isPrimitiveCatalystType(elementType)) + + // Unannotated repeated group should be interpreted as required list of required element, so + // list element type is just the group itself. Clip it. + if (parquetList.getOriginalType == null && parquetList.isRepetition(Repetition.REPEATED)) { + clipParquetType(parquetList, elementType) + } else { + assert( + parquetList.getOriginalType == OriginalType.LIST, + "Invalid Parquet schema. " + + "Original type of annotated Parquet lists must be LIST: " + + parquetList.toString) + + assert( + parquetList.getFieldCount == 1 && parquetList.getType(0).isRepetition(Repetition.REPEATED), + "Invalid Parquet schema. " + + "LIST-annotated group should only have exactly one repeated field: " + + parquetList) + + // Precondition of this method, should only be called for lists with nested element types. + assert(!parquetList.getType(0).isPrimitive) + + val repeatedGroup = parquetList.getType(0).asGroupType() + + // If the repeated field is a group with multiple fields, or the repeated field is a group + // with one field and is named either "array" or uses the LIST-annotated group's name with + // "_tuple" appended then the repeated type is the element type and elements are required. + // Build a new LIST-annotated group with clipped `repeatedGroup` as element type and the + // only field. + if ( + repeatedGroup.getFieldCount > 1 || + repeatedGroup.getName == "array" || + repeatedGroup.getName == parquetList.getName + "_tuple" + ) { + Types + .buildGroup(parquetList.getRepetition) + .as(OriginalType.LIST) + .addField(clipParquetType(repeatedGroup, elementType)) + .named(parquetList.getName) + } else { + // Otherwise, the repeated field's type is the element type with the repeated field's + // repetition. + Types + .buildGroup(parquetList.getRepetition) + .as(OriginalType.LIST) + .addField( + Types + .repeatedGroup() + .addField(clipParquetType(repeatedGroup.getType(0), elementType)) + .named(repeatedGroup.getName)) + .named(parquetList.getName) + } + } + } + + /** + * Clips a Parquet [[GroupType]] which corresponds to a Catalyst [[MapType]]. The value type + * of the [[MapType]] should also be a nested type, namely an [[ArrayType]], a [[MapType]], or a + * [[StructType]]. Note that key type of any [[MapType]] is always a primitive type. + */ + private def clipParquetMapType( + parquetMap: GroupType, keyType: DataType, valueType: DataType): GroupType = { + // Precondition of this method, should only be called for maps with nested value types. 
+ assert(!isPrimitiveCatalystType(valueType)) + + val repeatedGroup = parquetMap.getType(0).asGroupType() + val parquetKeyType = repeatedGroup.getType(0) + val parquetValueType = repeatedGroup.getType(1) + + val clippedRepeatedGroup = + Types + .repeatedGroup() + .as(repeatedGroup.getOriginalType) + .addField(parquetKeyType) + .addField(clipParquetType(parquetValueType, valueType)) + .named(repeatedGroup.getName) + + Types + .buildGroup(parquetMap.getRepetition) + .as(parquetMap.getOriginalType) + .addField(clippedRepeatedGroup) + .named(parquetMap.getName) + } + + /** + * Clips a Parquet [[GroupType]] which corresponds to a Catalyst [[StructType]]. + * + * @return A clipped [[GroupType]], which has at least one field. + * @note Parquet doesn't allow creating empty [[GroupType]] instances except for empty + * [[MessageType]]. Because it's legal to construct an empty requested schema for column + * pruning. + */ + private def clipParquetGroup(parquetRecord: GroupType, structType: StructType): GroupType = { + val clippedParquetFields = clipParquetGroupFields(parquetRecord, structType) + Types + .buildGroup(parquetRecord.getRepetition) + .as(parquetRecord.getOriginalType) + .addFields(clippedParquetFields: _*) + .named(parquetRecord.getName) + } + + /** + * Clips a Parquet [[GroupType]] which corresponds to a Catalyst [[StructType]]. + * + * @return A list of clipped [[GroupType]] fields, which can be empty. + */ + private def clipParquetGroupFields( + parquetRecord: GroupType, structType: StructType): Seq[Type] = { + val parquetFieldMap = parquetRecord.getFields.asScala.map(f => f.getName -> f).toMap + val toParquet = new CatalystSchemaConverter(followParquetFormatSpec = true) + structType.map { f => + parquetFieldMap + .get(f.name) + .map(clipParquetType(_, f.dataType)) + .getOrElse(toParquet.convertField(f)) + } + } } diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/parquet/CatalystRowConverter.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/parquet/CatalystRowConverter.scala index fe13dfbbed385..f17e794b76650 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/parquet/CatalystRowConverter.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/parquet/CatalystRowConverter.scala @@ -113,31 +113,6 @@ private[parquet] class CatalystPrimitiveConverter(val updater: ParentContainerUp * When used as a root converter, [[NoopUpdater]] should be used since root converters don't have * any "parent" container. * - * @note Constructor argument [[parquetType]] refers to requested fields of the actual schema of the - * Parquet file being read, while constructor argument [[catalystType]] refers to requested - * fields of the global schema. The key difference is that, in case of schema merging, - * [[parquetType]] can be a subset of [[catalystType]]. 
For example, it's possible to have - * the following [[catalystType]]: - * {{{ - * new StructType() - * .add("f1", IntegerType, nullable = false) - * .add("f2", StringType, nullable = true) - * .add("f3", new StructType() - * .add("f31", DoubleType, nullable = false) - * .add("f32", IntegerType, nullable = true) - * .add("f33", StringType, nullable = true), nullable = false) - * }}} - * and the following [[parquetType]] (`f2` and `f32` are missing): - * {{{ - * message root { - * required int32 f1; - * required group f3 { - * required double f31; - * optional binary f33 (utf8); - * } - * } - * }}} - * * @param parquetType Parquet schema of Parquet records * @param catalystType Spark SQL schema that corresponds to the Parquet record type * @param updater An updater which propagates converted field values to the parent container @@ -179,31 +154,7 @@ private[parquet] class CatalystRowConverter( // Converters for each field. private val fieldConverters: Array[Converter with HasParentContainerUpdater] = { - // In case of schema merging, `parquetType` can be a subset of `catalystType`. We need to pad - // those missing fields and create converters for them, although values of these fields are - // always null. - val paddedParquetFields = { - val parquetFields = parquetType.getFields.asScala - val parquetFieldNames = parquetFields.map(_.getName).toSet - val missingFields = catalystType.filterNot(f => parquetFieldNames.contains(f.name)) - - // We don't need to worry about feature flag arguments like `assumeBinaryIsString` when - // creating the schema converter here, since values of missing fields are always null. - val toParquet = new CatalystSchemaConverter() - - (parquetFields ++ missingFields.map(toParquet.convertField)).sortBy { f => - catalystType.indexWhere(_.name == f.getName) - } - } - - if (paddedParquetFields.length != catalystType.length) { - throw new UnsupportedOperationException( - "A Parquet file's schema has different number of fields with the table schema. " + - "Please enable schema merging by setting \"mergeSchema\" to true when load " + - "a Parquet dataset or set spark.sql.parquet.mergeSchema to true in SQLConf.") - } - - paddedParquetFields.zip(catalystType).zipWithIndex.map { + parquetType.getFields.asScala.zip(catalystType).zipWithIndex.map { case ((parquetFieldType, catalystField), ordinal) => // Converted field value should be set to the `ordinal`-th cell of `currentRow` newConverter(parquetFieldType, catalystField.dataType, new RowUpdater(currentRow, ordinal)) diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/parquet/CatalystSchemaConverter.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/parquet/CatalystSchemaConverter.scala index be6c0545f5a0a..a21ab1dbb25d1 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/parquet/CatalystSchemaConverter.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/parquet/CatalystSchemaConverter.scala @@ -55,16 +55,10 @@ import org.apache.spark.sql.{AnalysisException, SQLConf} * to old style non-standard behaviors. */ private[parquet] class CatalystSchemaConverter( - private val assumeBinaryIsString: Boolean, - private val assumeInt96IsTimestamp: Boolean, - private val followParquetFormatSpec: Boolean) { - - // Only used when constructing converter for converting Spark SQL schema to Parquet schema, in - // which case `assumeInt96IsTimestamp` and `assumeBinaryIsString` are irrelevant. 
- def this() = this( - assumeBinaryIsString = SQLConf.PARQUET_BINARY_AS_STRING.defaultValue.get, - assumeInt96IsTimestamp = SQLConf.PARQUET_INT96_AS_TIMESTAMP.defaultValue.get, - followParquetFormatSpec = SQLConf.PARQUET_FOLLOW_PARQUET_FORMAT_SPEC.defaultValue.get) + assumeBinaryIsString: Boolean = SQLConf.PARQUET_BINARY_AS_STRING.defaultValue.get, + assumeInt96IsTimestamp: Boolean = SQLConf.PARQUET_INT96_AS_TIMESTAMP.defaultValue.get, + followParquetFormatSpec: Boolean = SQLConf.PARQUET_FOLLOW_PARQUET_FORMAT_SPEC.defaultValue.get +) { def this(conf: SQLConf) = this( assumeBinaryIsString = conf.isParquetBinaryAsString, diff --git a/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/parquet/ParquetAvroCompatibilitySuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/parquet/ParquetAvroCompatibilitySuite.scala index bd7cf8c10abef..36b929ee1f409 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/parquet/ParquetAvroCompatibilitySuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/parquet/ParquetAvroCompatibilitySuite.scala @@ -17,6 +17,7 @@ package org.apache.spark.sql.execution.datasources.parquet +import java.io.File import java.nio.ByteBuffer import java.util.{List => JList, Map => JMap} diff --git a/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/parquet/ParquetInteroperabilitySuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/parquet/ParquetInteroperabilitySuite.scala new file mode 100644 index 0000000000000..83b65fb419ed3 --- /dev/null +++ b/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/parquet/ParquetInteroperabilitySuite.scala @@ -0,0 +1,90 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.spark.sql.execution.datasources.parquet + +import java.io.File + +import org.apache.spark.sql.Row +import org.apache.spark.sql.test.SharedSQLContext + +class ParquetInteroperabilitySuite extends ParquetCompatibilityTest with SharedSQLContext { + test("parquet files with different physical schemas but share the same logical schema") { + import ParquetCompatibilityTest._ + + // This test case writes two Parquet files, both representing the following Catalyst schema + // + // StructType( + // StructField( + // "f", + // ArrayType(IntegerType, containsNull = false), + // nullable = false)) + // + // The first Parquet file comes with parquet-avro style 2-level LIST-annotated group, while the + // other one comes with parquet-protobuf style 1-level unannotated primitive field. 
+ withTempDir { dir => + val avroStylePath = new File(dir, "avro-style").getCanonicalPath + val protobufStylePath = new File(dir, "protobuf-style").getCanonicalPath + + val avroStyleSchema = + """message avro_style { + | required group f (LIST) { + | repeated int32 array; + | } + |} + """.stripMargin + + writeDirect(avroStylePath, avroStyleSchema, { rc => + rc.message { + rc.field("f", 0) { + rc.group { + rc.field("array", 0) { + rc.addInteger(0) + rc.addInteger(1) + } + } + } + } + }) + + logParquetSchema(avroStylePath) + + val protobufStyleSchema = + """message protobuf_style { + | repeated int32 f; + |} + """.stripMargin + + writeDirect(protobufStylePath, protobufStyleSchema, { rc => + rc.message { + rc.field("f", 0) { + rc.addInteger(2) + rc.addInteger(3) + } + } + }) + + logParquetSchema(protobufStylePath) + + checkAnswer( + sqlContext.read.parquet(dir.getCanonicalPath), + Seq( + Row(Seq(0, 1)), + Row(Seq(2, 3)))) + } + } +} diff --git a/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/parquet/ParquetQuerySuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/parquet/ParquetQuerySuite.scala index b7b70c2bbbd5c..a379523d67f80 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/parquet/ParquetQuerySuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/parquet/ParquetQuerySuite.scala @@ -229,4 +229,81 @@ class ParquetQuerySuite extends QueryTest with ParquetTest with SharedSQLContext } } } + + test("SPARK-10301 Clipping nested structs in requested schema") { + withTempPath { dir => + val path = dir.getCanonicalPath + val df = sqlContext + .range(1) + .selectExpr("NAMED_STRUCT('a', id, 'b', id) AS s") + .coalesce(1) + + df.write.mode("append").parquet(path) + + val userDefinedSchema = new StructType() + .add("s", new StructType().add("a", LongType, nullable = true), nullable = true) + + checkAnswer( + sqlContext.read.schema(userDefinedSchema).parquet(path), + Row(Row(0))) + } + + withTempPath { dir => + val path = dir.getCanonicalPath + + val df1 = sqlContext + .range(1) + .selectExpr("NAMED_STRUCT('a', id, 'b', id) AS s") + .coalesce(1) + + val df2 = sqlContext + .range(1, 2) + .selectExpr("NAMED_STRUCT('b', id, 'c', id) AS s") + .coalesce(1) + + df1.write.parquet(path) + df2.write.mode(SaveMode.Append).parquet(path) + + val userDefinedSchema = new StructType() + .add("s", + new StructType() + .add("a", LongType, nullable = true) + .add("c", LongType, nullable = true), + nullable = true) + + checkAnswer( + sqlContext.read.schema(userDefinedSchema).parquet(path), + Seq( + Row(Row(0, null)), + Row(Row(null, 1)))) + } + + withTempPath { dir => + val path = dir.getCanonicalPath + + val df = sqlContext + .range(1) + .selectExpr("NAMED_STRUCT('a', ARRAY(NAMED_STRUCT('b', id, 'c', id))) AS s") + .coalesce(1) + + df.write.parquet(path) + + val userDefinedSchema = new StructType() + .add("s", + new StructType() + .add( + "a", + ArrayType( + new StructType() + .add("b", LongType, nullable = true) + .add("d", StringType, nullable = true), + containsNull = true), + nullable = true), + nullable = true) + + checkAnswer( + sqlContext.read.schema(userDefinedSchema).parquet(path), + Row(Row(Seq(Row(0, null))))) + } + } } diff --git a/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/parquet/ParquetSchemaSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/parquet/ParquetSchemaSuite.scala index 9dcbc1a047bea..28c59a4abdd76 100644 --- 
a/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/parquet/ParquetSchemaSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/parquet/ParquetSchemaSuite.scala @@ -22,6 +22,7 @@ import scala.reflect.runtime.universe.TypeTag import org.apache.parquet.schema.MessageTypeParser +import org.apache.spark.sql.SQLConf import org.apache.spark.sql.catalyst.ScalaReflection import org.apache.spark.sql.test.SharedSQLContext import org.apache.spark.sql.types._ @@ -941,4 +942,313 @@ class ParquetSchemaSuite extends ParquetSchemaTest { | optional fixed_len_byte_array(8) f1 (DECIMAL(18, 3)); |} """.stripMargin) + + private def testSchemaClipping( + testName: String, + parquetSchema: String, + catalystSchema: StructType, + expectedSchema: String): Unit = { + test(s"Clipping - $testName") { + val expected = MessageTypeParser.parseMessageType(expectedSchema) + val actual = CatalystReadSupport.clipParquetSchema( + MessageTypeParser.parseMessageType(parquetSchema), catalystSchema) + + try { + expected.checkContains(actual) + actual.checkContains(expected) + } catch { case cause: Throwable => + fail( + s"""Expected clipped schema: + |$expected + |Actual clipped schema: + |$actual + """.stripMargin, + cause) + } + } + } + + testSchemaClipping( + "simple nested struct", + + parquetSchema = + """message root { + | required group f0 { + | optional int32 f00; + | optional int32 f01; + | } + |} + """.stripMargin, + + catalystSchema = { + val f0Type = new StructType().add("f00", IntegerType, nullable = true) + new StructType() + .add("f0", f0Type, nullable = false) + .add("f1", IntegerType, nullable = true) + }, + + expectedSchema = + """message root { + | required group f0 { + | optional int32 f00; + | } + | optional int32 f1; + |} + """.stripMargin) + + testSchemaClipping( + "parquet-protobuf style array", + + parquetSchema = + """message root { + | required group f0 { + | repeated binary f00 (UTF8); + | repeated group f01 { + | optional int32 f010; + | optional double f011; + | } + | } + |} + """.stripMargin, + + catalystSchema = { + val f11Type = new StructType().add("f011", DoubleType, nullable = true) + val f01Type = ArrayType(StringType, containsNull = false) + val f0Type = new StructType() + .add("f00", f01Type, nullable = false) + .add("f01", f11Type, nullable = false) + val f1Type = ArrayType(IntegerType, containsNull = true) + new StructType() + .add("f0", f0Type, nullable = false) + .add("f1", f1Type, nullable = true) + }, + + expectedSchema = + """message root { + | required group f0 { + | repeated binary f00 (UTF8); + | repeated group f01 { + | optional double f011; + | } + | } + | + | optional group f1 (LIST) { + | repeated group list { + | optional int32 element; + | } + | } + |} + """.stripMargin) + + testSchemaClipping( + "parquet-thrift style array", + + parquetSchema = + """message root { + | required group f0 { + | optional group f00 { + | repeated binary f00_tuple (UTF8); + | } + | + | optional group f01 (LIST) { + | repeated group f01_tuple { + | optional int32 f010; + | optional double f011; + | } + | } + | } + |} + """.stripMargin, + + catalystSchema = { + val f11ElementType = new StructType() + .add("f011", DoubleType, nullable = true) + .add("f012", LongType, nullable = true) + + val f0Type = new StructType() + .add("f00", ArrayType(StringType, containsNull = false), nullable = false) + .add("f01", ArrayType(f11ElementType, containsNull = false), nullable = false) + + new StructType().add("f0", f0Type, nullable = false) + }, + + expectedSchema = 
+ """message root { + | required group f0 { + | optional group f00 { + | repeated binary f00_tuple (UTF8); + | } + | + | optional group f01 (LIST) { + | repeated group f01_tuple { + | optional double f011; + | optional int64 f012; + | } + | } + | } + |} + """.stripMargin) + + testSchemaClipping( + "parquet-avro style array", + + parquetSchema = + """message root { + | required group f0 { + | optional group f00 { + | repeated binary array (UTF8); + | } + | + | optional group f01 (LIST) { + | repeated group array { + | optional int32 f010; + | optional double f011; + | } + | } + | } + |} + """.stripMargin, + + catalystSchema = { + val f11ElementType = new StructType() + .add("f011", DoubleType, nullable = true) + .add("f012", LongType, nullable = true) + + val f0Type = new StructType() + .add("f00", ArrayType(StringType, containsNull = false), nullable = false) + .add("f01", ArrayType(f11ElementType, containsNull = false), nullable = false) + + new StructType().add("f0", f0Type, nullable = false) + }, + + expectedSchema = + """message root { + | required group f0 { + | optional group f00 { + | repeated binary array (UTF8); + | } + | + | optional group f01 (LIST) { + | repeated group array { + | optional double f011; + | optional int64 f012; + | } + | } + | } + |} + """.stripMargin) + + testSchemaClipping( + "parquet-hive style array", + + parquetSchema = + """message root { + | optional group f0 { + | optional group f00 (LIST) { + | repeated group bag { + | optional binary array_element; + | } + | } + | + | optional group f01 (LIST) { + | repeated group bag { + | optional group array_element { + | optional int32 f010; + | optional double f011; + | } + | } + | } + | } + |} + """.stripMargin, + + catalystSchema = { + val f01ElementType = new StructType() + .add("f011", DoubleType, nullable = true) + .add("f012", LongType, nullable = true) + + val f0Type = new StructType() + .add("f00", ArrayType(StringType, containsNull = true), nullable = true) + .add("f01", ArrayType(f01ElementType, containsNull = true), nullable = true) + + new StructType().add("f0", f0Type, nullable = true) + }, + + expectedSchema = + """message root { + | optional group f0 { + | optional group f00 (LIST) { + | repeated group bag { + | optional binary array_element; + | } + | } + | + | optional group f01 (LIST) { + | repeated group bag { + | optional group array_element { + | optional double f011; + | optional int64 f012; + | } + | } + | } + | } + |} + """.stripMargin) + + testSchemaClipping( + "2-level list of required struct", + + parquetSchema = + s"""message root { + | required group f0 { + | required group f00 (LIST) { + | repeated group element { + | required int32 f000; + | optional int64 f001; + | } + | } + | } + |} + """.stripMargin, + + catalystSchema = { + val f00ElementType = + new StructType() + .add("f001", LongType, nullable = true) + .add("f002", DoubleType, nullable = false) + + val f00Type = ArrayType(f00ElementType, containsNull = false) + val f0Type = new StructType().add("f00", f00Type, nullable = false) + + new StructType().add("f0", f0Type, nullable = false) + }, + + expectedSchema = + s"""message root { + | required group f0 { + | required group f00 (LIST) { + | repeated group element { + | optional int64 f001; + | required double f002; + | } + | } + | } + |} + """.stripMargin) + + testSchemaClipping( + "empty requested schema", + + parquetSchema = + """message root { + | required group f0 { + | required int32 f00; + | required int64 f01; + | } + |} + """.stripMargin, + + catalystSchema = new 
StructType(), + + expectedSchema = "message root {}") } From e6e483cc4de740c46398385b03ffe0e662edae39 Mon Sep 17 00:00:00 2001 From: Holden Karau Date: Tue, 1 Sep 2015 10:48:57 -0700 Subject: [PATCH 161/802] [SPARK-9679] [ML] [PYSPARK] Add Python API for Stop Words Remover Add a python API for the Stop Words Remover. Author: Holden Karau Closes #8118 from holdenk/SPARK-9679-python-StopWordsRemover. --- .../spark/ml/feature/StopWordsRemover.scala | 6 +- .../ml/feature/StopWordsRemoverSuite.scala | 2 +- python/pyspark/ml/feature.py | 73 ++++++++++++++++++- python/pyspark/ml/tests.py | 20 ++++- 4 files changed, 93 insertions(+), 8 deletions(-) diff --git a/mllib/src/main/scala/org/apache/spark/ml/feature/StopWordsRemover.scala b/mllib/src/main/scala/org/apache/spark/ml/feature/StopWordsRemover.scala index 5d77ea08db657..7da430c7d16df 100644 --- a/mllib/src/main/scala/org/apache/spark/ml/feature/StopWordsRemover.scala +++ b/mllib/src/main/scala/org/apache/spark/ml/feature/StopWordsRemover.scala @@ -29,14 +29,14 @@ import org.apache.spark.sql.types.{ArrayType, StringType, StructField, StructTyp /** * stop words list */ -private object StopWords { +private[spark] object StopWords { /** * Use the same default stopwords list as scikit-learn. * The original list can be found from "Glasgow Information Retrieval Group" * [[http://ir.dcs.gla.ac.uk/resources/linguistic_utils/stop_words]] */ - val EnglishStopWords = Array( "a", "about", "above", "across", "after", "afterwards", "again", + val English = Array( "a", "about", "above", "across", "after", "afterwards", "again", "against", "all", "almost", "alone", "along", "already", "also", "although", "always", "am", "among", "amongst", "amoungst", "amount", "an", "and", "another", "any", "anyhow", "anyone", "anything", "anyway", "anywhere", "are", @@ -121,7 +121,7 @@ class StopWordsRemover(override val uid: String) /** @group getParam */ def getCaseSensitive: Boolean = $(caseSensitive) - setDefault(stopWords -> StopWords.EnglishStopWords, caseSensitive -> false) + setDefault(stopWords -> StopWords.English, caseSensitive -> false) override def transform(dataset: DataFrame): DataFrame = { val outputSchema = transformSchema(dataset.schema) diff --git a/mllib/src/test/scala/org/apache/spark/ml/feature/StopWordsRemoverSuite.scala b/mllib/src/test/scala/org/apache/spark/ml/feature/StopWordsRemoverSuite.scala index f01306f89cb5f..e0d433f566c25 100644 --- a/mllib/src/test/scala/org/apache/spark/ml/feature/StopWordsRemoverSuite.scala +++ b/mllib/src/test/scala/org/apache/spark/ml/feature/StopWordsRemoverSuite.scala @@ -65,7 +65,7 @@ class StopWordsRemoverSuite extends SparkFunSuite with MLlibTestSparkContext { } test("StopWordsRemover with additional words") { - val stopWords = StopWords.EnglishStopWords ++ Array("python", "scala") + val stopWords = StopWords.English ++ Array("python", "scala") val remover = new StopWordsRemover() .setInputCol("raw") .setOutputCol("filtered") diff --git a/python/pyspark/ml/feature.py b/python/pyspark/ml/feature.py index 0626281e200a1..d955307e27efd 100644 --- a/python/pyspark/ml/feature.py +++ b/python/pyspark/ml/feature.py @@ -22,7 +22,7 @@ from pyspark.rdd import ignore_unicode_prefix from pyspark.ml.param.shared import * from pyspark.ml.util import keyword_only -from pyspark.ml.wrapper import JavaEstimator, JavaModel, JavaTransformer +from pyspark.ml.wrapper import JavaEstimator, JavaModel, JavaTransformer, _jvm from pyspark.mllib.common import inherit_doc from pyspark.mllib.linalg import _convert_to_vector @@ -30,7 +30,7 @@ 
'NGram', 'Normalizer', 'OneHotEncoder', 'PolynomialExpansion', 'RegexTokenizer', 'SQLTransformer', 'StandardScaler', 'StandardScalerModel', 'StringIndexer', 'StringIndexerModel', 'Tokenizer', 'VectorAssembler', 'VectorIndexer', 'Word2Vec', - 'Word2VecModel', 'PCA', 'PCAModel', 'RFormula', 'RFormulaModel'] + 'Word2VecModel', 'PCA', 'PCAModel', 'RFormula', 'RFormulaModel', 'StopWordsRemover'] @inherit_doc @@ -933,6 +933,75 @@ class StringIndexerModel(JavaModel): """ +class StopWordsRemover(JavaTransformer, HasInputCol, HasOutputCol): + """ + .. note:: Experimental + + A feature transformer that filters out stop words from input. + Note: null values from input array are preserved unless adding null to stopWords explicitly. + """ + # a placeholder to make the stopwords show up in generated doc + stopWords = Param(Params._dummy(), "stopWords", "The words to be filtered out") + caseSensitive = Param(Params._dummy(), "caseSensitive", "whether to do a case sensitive " + + "comparison over the stop words") + + @keyword_only + def __init__(self, inputCol=None, outputCol=None, stopWords=None, + caseSensitive=False): + """ + __init__(self, inputCol=None, outputCol=None, stopWords=None,\ + caseSensitive=false) + """ + super(StopWordsRemover, self).__init__() + self._java_obj = self._new_java_obj("org.apache.spark.ml.feature.StopWordsRemover", + self.uid) + self.stopWords = Param(self, "stopWords", "The words to be filtered out") + self.caseSensitive = Param(self, "caseSensitive", "whether to do a case " + + "sensitive comparison over the stop words") + stopWordsObj = _jvm().org.apache.spark.ml.feature.StopWords + defaultStopWords = stopWordsObj.English() + self._setDefault(stopWords=defaultStopWords) + kwargs = self.__init__._input_kwargs + self.setParams(**kwargs) + + @keyword_only + def setParams(self, inputCol=None, outputCol=None, stopWords=None, + caseSensitive=False): + """ + setParams(self, inputCol="input", outputCol="output", stopWords=None,\ + caseSensitive=false) + Sets params for this StopWordRemover. + """ + kwargs = self.setParams._input_kwargs + return self._set(**kwargs) + + def setStopWords(self, value): + """ + Specify the stopwords to be filtered. + """ + self._paramMap[self.stopWords] = value + return self + + def getStopWords(self): + """ + Get the stopwords. + """ + return self.getOrDefault(self.stopWords) + + def setCaseSensitive(self, value): + """ + Set whether to do a case sensitive comparison over the stop words + """ + self._paramMap[self.caseSensitive] = value + return self + + def getCaseSensitive(self): + """ + Get whether to do a case sensitive comparison over the stop words. 
+ """ + return self.getOrDefault(self.caseSensitive) + + @inherit_doc @ignore_unicode_prefix class Tokenizer(JavaTransformer, HasInputCol, HasOutputCol): diff --git a/python/pyspark/ml/tests.py b/python/pyspark/ml/tests.py index 60e4237293adc..b892318f50bd9 100644 --- a/python/pyspark/ml/tests.py +++ b/python/pyspark/ml/tests.py @@ -31,7 +31,7 @@ import unittest from pyspark.tests import ReusedPySparkTestCase as PySparkTestCase -from pyspark.sql import DataFrame, SQLContext +from pyspark.sql import DataFrame, SQLContext, Row from pyspark.sql.functions import rand from pyspark.ml.evaluation import RegressionEvaluator from pyspark.ml.param import Param, Params @@ -258,7 +258,7 @@ def test_idf(self): def test_ngram(self): sqlContext = SQLContext(self.sc) dataset = sqlContext.createDataFrame([ - ([["a", "b", "c", "d", "e"]])], ["input"]) + Row(input=["a", "b", "c", "d", "e"])]) ngram0 = NGram(n=4, inputCol="input", outputCol="output") self.assertEqual(ngram0.getN(), 4) self.assertEqual(ngram0.getInputCol(), "input") @@ -266,6 +266,22 @@ def test_ngram(self): transformedDF = ngram0.transform(dataset) self.assertEquals(transformedDF.head().output, ["a b c d", "b c d e"]) + def test_stopwordsremover(self): + sqlContext = SQLContext(self.sc) + dataset = sqlContext.createDataFrame([Row(input=["a", "panda"])]) + stopWordRemover = StopWordsRemover(inputCol="input", outputCol="output") + # Default + self.assertEquals(stopWordRemover.getInputCol(), "input") + transformedDF = stopWordRemover.transform(dataset) + self.assertEquals(transformedDF.head().output, ["panda"]) + # Custom + stopwords = ["panda"] + stopWordRemover.setStopWords(stopwords) + self.assertEquals(stopWordRemover.getInputCol(), "input") + self.assertEquals(stopWordRemover.getStopWords(), stopwords) + transformedDF = stopWordRemover.transform(dataset) + self.assertEquals(transformedDF.head().output, ["a"]) + class HasInducedError(Params): From 3f63bd6023edcc9af268933a235f34e10bc3d2ba Mon Sep 17 00:00:00 2001 From: Sean Owen Date: Tue, 1 Sep 2015 20:06:01 +0100 Subject: [PATCH 162/802] [SPARK-10398] [DOCS] Migrate Spark download page to use new lua mirroring scripts Migrate Apache download closer.cgi refs to new closer.lua This is the bit of the change that affects the project docs; I'm implementing the changes to the Apache site separately. Author: Sean Owen Closes #8557 from srowen/SPARK-10398. --- docker/spark-mesos/Dockerfile | 2 +- docs/running-on-mesos.md | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/docker/spark-mesos/Dockerfile b/docker/spark-mesos/Dockerfile index b90aef3655dee..fb3f267fe5c78 100644 --- a/docker/spark-mesos/Dockerfile +++ b/docker/spark-mesos/Dockerfile @@ -24,7 +24,7 @@ RUN apt-get update && \ apt-get install -y python libnss3 openjdk-7-jre-headless curl RUN mkdir /opt/spark && \ - curl http://www.apache.org/dyn/closer.cgi/spark/spark-1.4.0/spark-1.4.0-bin-hadoop2.4.tgz \ + curl http://www.apache.org/dyn/closer.lua/spark/spark-1.4.0/spark-1.4.0-bin-hadoop2.4.tgz \ | tar -xzC /opt ENV SPARK_HOME /opt/spark ENV MESOS_NATIVE_JAVA_LIBRARY /usr/local/lib/libmesos.so diff --git a/docs/running-on-mesos.md b/docs/running-on-mesos.md index cfd219ab02e26..f36921ae30c2f 100644 --- a/docs/running-on-mesos.md +++ b/docs/running-on-mesos.md @@ -45,7 +45,7 @@ frameworks. You can install Mesos either from source or using prebuilt packages To install Apache Mesos from source, follow these steps: 1. 
Download a Mesos release from a - [mirror](http://www.apache.org/dyn/closer.cgi/mesos/{{site.MESOS_VERSION}}/) + [mirror](http://www.apache.org/dyn/closer.lua/mesos/{{site.MESOS_VERSION}}/) 2. Follow the Mesos [Getting Started](http://mesos.apache.org/gettingstarted) page for compiling and installing Mesos From ec012805337926e56343be2761a1037296446880 Mon Sep 17 00:00:00 2001 From: zhuol Date: Tue, 1 Sep 2015 11:14:59 -1000 Subject: [PATCH 163/802] [SPARK-4223] [CORE] Support * in acls. SPARK-4223. Currently we support setting view and modify acls but you have to specify a list of users. It would be nice to support * meaning all users have access. Manual tests to verify that: "*" works for any user in: a. Spark ui: view and kill stage. Done. b. Spark history server. Done. c. Yarn application killing. Done. Author: zhuol Closes #8398 from zhuoliu/4223. --- .../org/apache/spark/SecurityManager.scala | 26 ++++++++++-- .../apache/spark/SecurityManagerSuite.scala | 41 +++++++++++++++++++ docs/configuration.md | 9 ++-- 3 files changed, 69 insertions(+), 7 deletions(-) diff --git a/core/src/main/scala/org/apache/spark/SecurityManager.scala b/core/src/main/scala/org/apache/spark/SecurityManager.scala index 673ef49e7c1c5..746d2081d4393 100644 --- a/core/src/main/scala/org/apache/spark/SecurityManager.scala +++ b/core/src/main/scala/org/apache/spark/SecurityManager.scala @@ -310,7 +310,16 @@ private[spark] class SecurityManager(sparkConf: SparkConf) setViewAcls(Set[String](defaultUser), allowedUsers) } - def getViewAcls: String = viewAcls.mkString(",") + /** + * Checking the existence of "*" is necessary as YARN can't recognize the "*" in "defaultuser,*" + */ + def getViewAcls: String = { + if (viewAcls.contains("*")) { + "*" + } else { + viewAcls.mkString(",") + } + } /** * Admin acls should be set before the view or modify acls. If you modify the admin @@ -321,7 +330,16 @@ private[spark] class SecurityManager(sparkConf: SparkConf) logInfo("Changing modify acls to: " + modifyAcls.mkString(",")) } - def getModifyAcls: String = modifyAcls.mkString(",") + /** + * Checking the existence of "*" is necessary as YARN can't recognize the "*" in "defaultuser,*" + */ + def getModifyAcls: String = { + if (modifyAcls.contains("*")) { + "*" + } else { + modifyAcls.mkString(",") + } + } /** * Admin acls should be set before the view or modify acls. 
If you modify the admin @@ -394,7 +412,7 @@ private[spark] class SecurityManager(sparkConf: SparkConf) def checkUIViewPermissions(user: String): Boolean = { logDebug("user=" + user + " aclsEnabled=" + aclsEnabled() + " viewAcls=" + viewAcls.mkString(",")) - !aclsEnabled || user == null || viewAcls.contains(user) + !aclsEnabled || user == null || viewAcls.contains(user) || viewAcls.contains("*") } /** @@ -409,7 +427,7 @@ private[spark] class SecurityManager(sparkConf: SparkConf) def checkModifyPermissions(user: String): Boolean = { logDebug("user=" + user + " aclsEnabled=" + aclsEnabled() + " modifyAcls=" + modifyAcls.mkString(",")) - !aclsEnabled || user == null || modifyAcls.contains(user) + !aclsEnabled || user == null || modifyAcls.contains(user) || modifyAcls.contains("*") } diff --git a/core/src/test/scala/org/apache/spark/SecurityManagerSuite.scala b/core/src/test/scala/org/apache/spark/SecurityManagerSuite.scala index f34aefca4eb18..f29160d834082 100644 --- a/core/src/test/scala/org/apache/spark/SecurityManagerSuite.scala +++ b/core/src/test/scala/org/apache/spark/SecurityManagerSuite.scala @@ -125,6 +125,47 @@ class SecurityManagerSuite extends SparkFunSuite { } + test("set security with * in acls") { + val conf = new SparkConf + conf.set("spark.ui.acls.enable", "true") + conf.set("spark.admin.acls", "user1,user2") + conf.set("spark.ui.view.acls", "*") + conf.set("spark.modify.acls", "user4") + + val securityManager = new SecurityManager(conf) + assert(securityManager.aclsEnabled() === true) + + // check for viewAcls with * + assert(securityManager.checkUIViewPermissions("user1") === true) + assert(securityManager.checkUIViewPermissions("user5") === true) + assert(securityManager.checkUIViewPermissions("user6") === true) + assert(securityManager.checkModifyPermissions("user4") === true) + assert(securityManager.checkModifyPermissions("user7") === false) + assert(securityManager.checkModifyPermissions("user8") === false) + + // check for modifyAcls with * + securityManager.setModifyAcls(Set("user4"), "*") + assert(securityManager.checkModifyPermissions("user7") === true) + assert(securityManager.checkModifyPermissions("user8") === true) + + securityManager.setAdminAcls("user1,user2") + securityManager.setModifyAcls(Set("user1"), "user2") + securityManager.setViewAcls(Set("user1"), "user2") + assert(securityManager.checkUIViewPermissions("user5") === false) + assert(securityManager.checkUIViewPermissions("user6") === false) + assert(securityManager.checkModifyPermissions("user7") === false) + assert(securityManager.checkModifyPermissions("user8") === false) + + // check for adminAcls with * + securityManager.setAdminAcls("user1,*") + securityManager.setModifyAcls(Set("user1"), "user2") + securityManager.setViewAcls(Set("user1"), "user2") + assert(securityManager.checkUIViewPermissions("user5") === true) + assert(securityManager.checkUIViewPermissions("user6") === true) + assert(securityManager.checkModifyPermissions("user7") === true) + assert(securityManager.checkModifyPermissions("user8") === true) + } + test("ssl on setup") { val conf = SSLSampleConfigs.sparkSSLConfig() val expectedAlgorithms = Set( diff --git a/docs/configuration.md b/docs/configuration.md index 77c5cbc7b3196..fb0315ce7c3cc 100644 --- a/docs/configuration.md +++ b/docs/configuration.md @@ -1286,7 +1286,8 @@ Apart from these, the following properties are also available, and may be useful Comma separated list of users/administrators that have view and modify access to all Spark jobs. 
This can be used if you run on a shared cluster and have a set of administrators or devs who - help debug when things work. + help debug when things work. Putting a "*" in the list means any user can have the priviledge + of admin. @@ -1327,7 +1328,8 @@ Apart from these, the following properties are also available, and may be useful Empty Comma separated list of users that have modify access to the Spark job. By default only the - user that started the Spark job has access to modify it (kill it for example). + user that started the Spark job has access to modify it (kill it for example). Putting a "*" in + the list means any user can have access to modify it. @@ -1349,7 +1351,8 @@ Apart from these, the following properties are also available, and may be useful Empty Comma separated list of users that have view access to the Spark web ui. By default only the - user that started the Spark job has view access. + user that started the Spark job has view access. Putting a "*" in the list means any user can + have view access to this Spark job. From bf550a4b551b6dd18fea3eb3f70497f9a6ad8e6c Mon Sep 17 00:00:00 2001 From: 0x0FFF Date: Tue, 1 Sep 2015 14:34:59 -0700 Subject: [PATCH 164/802] [SPARK-10162] [SQL] Fix the timezone omitting for PySpark Dataframe filter function This PR addresses [SPARK-10162](https://issues.apache.org/jira/browse/SPARK-10162) The issue is with DataFrame filter() function, if datetime.datetime is passed to it: * Timezone information of this datetime is ignored * This datetime is assumed to be in local timezone, which depends on the OS timezone setting Fix includes both code change and regression test. Problem reproduction code on master: ```python import pytz from datetime import datetime from pyspark.sql import * from pyspark.sql.types import * sqc = SQLContext(sc) df = sqc.createDataFrame([], StructType([StructField("dt", TimestampType())])) m1 = pytz.timezone('UTC') m2 = pytz.timezone('Etc/GMT+3') df.filter(df.dt > datetime(2000, 01, 01, tzinfo=m1)).explain() df.filter(df.dt > datetime(2000, 01, 01, tzinfo=m2)).explain() ``` It gives the same timestamp ignoring time zone: ``` >>> df.filter(df.dt > datetime(2000, 01, 01, tzinfo=m1)).explain() Filter (dt#0 > 946713600000000) Scan PhysicalRDD[dt#0] >>> df.filter(df.dt > datetime(2000, 01, 01, tzinfo=m2)).explain() Filter (dt#0 > 946713600000000) Scan PhysicalRDD[dt#0] ``` After the fix: ``` >>> df.filter(df.dt > datetime(2000, 01, 01, tzinfo=m1)).explain() Filter (dt#0 > 946684800000000) Scan PhysicalRDD[dt#0] >>> df.filter(df.dt > datetime(2000, 01, 01, tzinfo=m2)).explain() Filter (dt#0 > 946695600000000) Scan PhysicalRDD[dt#0] ``` PR [8536](https://github.com/apache/spark/pull/8536) was occasionally closed by me dropping the repo Author: 0x0FFF Closes #8555 from 0x0FFF/SPARK-10162. 
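For reference, the behavioral difference comes down to how a timezone-aware `datetime` is converted to epoch seconds. A minimal standalone sketch of the two conversions (plain Python 3, no Spark required; variable names are illustrative):

```python
import calendar
import time
from datetime import datetime, timedelta, timezone

# A timezone-aware timestamp: 2000-01-01 00:00:00 at UTC-3, i.e. 03:00:00 UTC.
dt = datetime(2000, 1, 1, tzinfo=timezone(timedelta(hours=-3)))

# Old behaviour: tzinfo is ignored and the naive timetuple is interpreted in
# the local OS timezone, so the result depends on the machine's settings.
local_seconds = time.mktime(dt.timetuple())

# Fixed behaviour: utctimetuple() normalizes to UTC first, then timegm()
# converts without consulting the local timezone.
utc_seconds = calendar.timegm(dt.utctimetuple())

print(local_seconds)  # machine-dependent
print(utc_seconds)    # 946695600, i.e. the 946695600000000 us shown above for Etc/GMT+3
```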
--- python/pyspark/sql/tests.py | 26 ++++++++++++++++++-------- python/pyspark/sql/types.py | 7 +++++-- 2 files changed, 23 insertions(+), 10 deletions(-) diff --git a/python/pyspark/sql/tests.py b/python/pyspark/sql/tests.py index cd32e26c64f22..59a891bd7c420 100644 --- a/python/pyspark/sql/tests.py +++ b/python/pyspark/sql/tests.py @@ -50,16 +50,17 @@ from pyspark.sql.utils import AnalysisException, IllegalArgumentException -class UTC(datetime.tzinfo): - """UTC""" - ZERO = datetime.timedelta(0) +class UTCOffsetTimezone(datetime.tzinfo): + """ + Specifies timezone in UTC offset + """ + + def __init__(self, offset=0): + self.ZERO = datetime.timedelta(hours=offset) def utcoffset(self, dt): return self.ZERO - def tzname(self, dt): - return "UTC" - def dst(self, dt): return self.ZERO @@ -841,13 +842,22 @@ def test_filter_with_datetime(self): self.assertEqual(0, df.filter(df.date > date).count()) self.assertEqual(0, df.filter(df.time > time).count()) + def test_filter_with_datetime_timezone(self): + dt1 = datetime.datetime(2015, 4, 17, 23, 1, 2, 3000, tzinfo=UTCOffsetTimezone(0)) + dt2 = datetime.datetime(2015, 4, 17, 23, 1, 2, 3000, tzinfo=UTCOffsetTimezone(1)) + row = Row(date=dt1) + df = self.sqlCtx.createDataFrame([row]) + self.assertEqual(0, df.filter(df.date == dt2).count()) + self.assertEqual(1, df.filter(df.date > dt2).count()) + self.assertEqual(0, df.filter(df.date < dt2).count()) + def test_time_with_timezone(self): day = datetime.date.today() now = datetime.datetime.now() ts = time.mktime(now.timetuple()) # class in __main__ is not serializable - from pyspark.sql.tests import UTC - utc = UTC() + from pyspark.sql.tests import UTCOffsetTimezone + utc = UTCOffsetTimezone() utcnow = datetime.datetime.utcfromtimestamp(ts) # without microseconds # add microseconds to utcnow (keeping year,month,day,hour,minute,second) utcnow = datetime.datetime(*(utcnow.timetuple()[:6] + (now.microsecond, utc))) diff --git a/python/pyspark/sql/types.py b/python/pyspark/sql/types.py index 94e581a78364c..f84d08d7098ad 100644 --- a/python/pyspark/sql/types.py +++ b/python/pyspark/sql/types.py @@ -1290,8 +1290,11 @@ def can_convert(self, obj): def convert(self, obj, gateway_client): Timestamp = JavaClass("java.sql.Timestamp", gateway_client) - return Timestamp(int(time.mktime(obj.timetuple())) * 1000 + obj.microsecond // 1000) - + seconds = (calendar.timegm(obj.utctimetuple()) if obj.tzinfo + else time.mktime(obj.timetuple())) + t = Timestamp(int(seconds) * 1000) + t.setNanos(obj.microsecond * 1000) + return t # datetime is a subclass of date, we should register DatetimeConverter first register_input_converter(DatetimeConverter()) From 00d9af5e190475affffb8b50467fcddfc40f50dc Mon Sep 17 00:00:00 2001 From: 0x0FFF Date: Tue, 1 Sep 2015 14:58:49 -0700 Subject: [PATCH 165/802] [SPARK-10392] [SQL] Pyspark - Wrong DateType support on JDBC connection This PR addresses issue [SPARK-10392](https://issues.apache.org/jira/browse/SPARK-10392) The problem is that for "start of epoch" date (01 Jan 1970) PySpark class DateType returns 0 instead of the `datetime.date` due to implementation of its return statement Issue reproduction on master: ``` >>> from pyspark.sql.types import * >>> a = DateType() >>> a.fromInternal(0) 0 >>> a.fromInternal(1) datetime.date(1970, 1, 2) ``` Author: 0x0FFF Closes #8556 from 0x0FFF/SPARK-10392. 
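The root cause is the `x and f(x)` idiom used as a None guard: for the start of the epoch the internal value is the integer `0`, which is falsy, so the guard short-circuits and returns `0` itself instead of a date. A standalone sketch mirroring the `types.py` hunk below (function names shortened for illustration):

```python
import datetime

EPOCH_ORDINAL = datetime.datetime(1970, 1, 1).toordinal()

def from_internal_buggy(v):
    # `v and ...` was intended as a None guard, but it also short-circuits
    # when v == 0 (the epoch day) and returns the integer 0 instead of a date.
    return v and datetime.date.fromordinal(v + EPOCH_ORDINAL)

def from_internal_fixed(v):
    # Only skip the conversion when v is actually None.
    if v is not None:
        return datetime.date.fromordinal(v + EPOCH_ORDINAL)

print(from_internal_buggy(0))  # 0 -- wrong
print(from_internal_fixed(0))  # 1970-01-01
print(from_internal_fixed(1))  # 1970-01-02
```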
--- python/pyspark/sql/tests.py | 5 +++++ python/pyspark/sql/types.py | 6 ++++-- 2 files changed, 9 insertions(+), 2 deletions(-) diff --git a/python/pyspark/sql/tests.py b/python/pyspark/sql/tests.py index 59a891bd7c420..fc778631d93a3 100644 --- a/python/pyspark/sql/tests.py +++ b/python/pyspark/sql/tests.py @@ -168,6 +168,11 @@ def test_decimal_type(self): t3 = DecimalType(8) self.assertNotEqual(t2, t3) + # regression test for SPARK-10392 + def test_datetype_equal_zero(self): + dt = DateType() + self.assertEqual(dt.fromInternal(0), datetime.date(1970, 1, 1)) + class SQLTests(ReusedPySparkTestCase): diff --git a/python/pyspark/sql/types.py b/python/pyspark/sql/types.py index f84d08d7098ad..8bd58d69eeecd 100644 --- a/python/pyspark/sql/types.py +++ b/python/pyspark/sql/types.py @@ -168,10 +168,12 @@ def needConversion(self): return True def toInternal(self, d): - return d and d.toordinal() - self.EPOCH_ORDINAL + if d is not None: + return d.toordinal() - self.EPOCH_ORDINAL def fromInternal(self, v): - return v and datetime.date.fromordinal(v + self.EPOCH_ORDINAL) + if v is not None: + return datetime.date.fromordinal(v + self.EPOCH_ORDINAL) class TimestampType(AtomicType): From c3b881a7d7e4736f7131ff002a80e25def1f63af Mon Sep 17 00:00:00 2001 From: Chuan Shao Date: Wed, 2 Sep 2015 11:02:27 -0700 Subject: [PATCH 166/802] [SPARK-7336] [HISTORYSERVER] Fix bug that applications status incorrect on JobHistory UI. Author: ArcherShao Closes #5886 from ArcherShao/SPARK-7336. --- .../deploy/history/FsHistoryProvider.scala | 27 +++++++++++++++---- 1 file changed, 22 insertions(+), 5 deletions(-) diff --git a/core/src/main/scala/org/apache/spark/deploy/history/FsHistoryProvider.scala b/core/src/main/scala/org/apache/spark/deploy/history/FsHistoryProvider.scala index e573ff16c50a3..a5755eac36396 100644 --- a/core/src/main/scala/org/apache/spark/deploy/history/FsHistoryProvider.scala +++ b/core/src/main/scala/org/apache/spark/deploy/history/FsHistoryProvider.scala @@ -18,6 +18,7 @@ package org.apache.spark.deploy.history import java.io.{BufferedInputStream, FileNotFoundException, InputStream, IOException, OutputStream} +import java.util.UUID import java.util.concurrent.{ExecutorService, Executors, TimeUnit} import java.util.zip.{ZipEntry, ZipOutputStream} @@ -73,7 +74,7 @@ private[history] class FsHistoryProvider(conf: SparkConf, clock: Clock) // The modification time of the newest log detected during the last scan. This is used // to ignore logs that are older during subsequent scans, to avoid processing data that // is already known. - private var lastModifiedTime = -1L + private var lastScanTime = -1L // Mapping of application IDs to their metadata, in descending end time order. Apps are inserted // into the map in order, so the LinkedHashMap maintains the correct ordering. 
@@ -179,15 +180,14 @@ private[history] class FsHistoryProvider(conf: SparkConf, clock: Clock) */ private[history] def checkForLogs(): Unit = { try { + val newLastScanTime = getNewLastScanTime() val statusList = Option(fs.listStatus(new Path(logDir))).map(_.toSeq) .getOrElse(Seq[FileStatus]()) - var newLastModifiedTime = lastModifiedTime val logInfos: Seq[FileStatus] = statusList .filter { entry => try { getModificationTime(entry).map { time => - newLastModifiedTime = math.max(newLastModifiedTime, time) - time >= lastModifiedTime + time >= lastScanTime }.getOrElse(false) } catch { case e: AccessControlException => @@ -224,12 +224,29 @@ private[history] class FsHistoryProvider(conf: SparkConf, clock: Clock) } } - lastModifiedTime = newLastModifiedTime + lastScanTime = newLastScanTime } catch { case e: Exception => logError("Exception in checking for event log updates", e) } } + private def getNewLastScanTime(): Long = { + val fileName = "." + UUID.randomUUID().toString + val path = new Path(logDir, fileName) + val fos = fs.create(path) + + try { + fos.close() + fs.getFileStatus(path).getModificationTime + } catch { + case e: Exception => + logError("Exception encountered when attempting to update last scan time", e) + lastScanTime + } finally { + fs.delete(path) + } + } + override def writeEventLogs( appId: String, attemptId: Option[String], From 56c4c172e99a5e14f4bc3308e7ff36d94113b63e Mon Sep 17 00:00:00 2001 From: Wenchen Fan Date: Wed, 2 Sep 2015 11:13:17 -0700 Subject: [PATCH 167/802] [SPARK-10034] [SQL] add regression test for Sort on Aggregate Before #8371, there was a bug for `Sort` on `Aggregate` that we can't use aggregate expressions named `_aggOrdering` and can't use more than one ordering expressions which contains aggregate functions. The reason of this bug is that: The aggregate expression in `SortOrder` never get resolved, we alias it with `_aggOrdering` and call `toAttribute` which gives us an `UnresolvedAttribute`. So actually we are referencing aggregate expression by name, not by exprId like we thought. And if there is already an aggregate expression named `_aggOrdering` or there are more than one ordering expressions having aggregate functions, we will have conflict names and can't search by name. However, after #8371 got merged, the `SortOrder`s are guaranteed to be resolved and we are always referencing aggregate expression by exprId. The Bug doesn't exist anymore and this PR add regression tests for it. Author: Wenchen Fan Closes #8231 from cloud-fan/sort-agg. 
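For context, the Scala regression test added below has a direct PySpark counterpart, since both go through the same analyzer rule. A sketch under that assumption (an existing `sqlContext` is assumed; column names mirror the Scala test):

```python
from pyspark.sql import functions as F

df = sqlContext.createDataFrame([(1, 2)], ["i", "j"])

# Order the aggregated result by an aggregate expression while another
# aggregate is aliased "aggOrdering" -- the combination that used to clash
# with the analyzer's internal ordering alias before ordering expressions
# were referenced by exprId.
result = (df.groupBy("i")
            .agg(F.max("j").alias("aggOrdering"))
            .orderBy(F.sum("j")))

result.show()  # expected: a single row with i=1, aggOrdering=2
```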
--- .../scala/org/apache/spark/sql/DataFrameSuite.scala | 8 ++++++++ .../scala/org/apache/spark/sql/SQLQuerySuite.scala | 10 ++++++++++ 2 files changed, 18 insertions(+) diff --git a/sql/core/src/test/scala/org/apache/spark/sql/DataFrameSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/DataFrameSuite.scala index 284fff184085a..a4871e247cff7 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/DataFrameSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/DataFrameSuite.scala @@ -887,4 +887,12 @@ class DataFrameSuite extends QueryTest with SharedSQLContext { .select(struct($"b")) .collect() } + + test("SPARK-10034: Sort on Aggregate with aggregation expression named 'aggOrdering'") { + val df = Seq(1 -> 2).toDF("i", "j") + val query = df.groupBy('i) + .agg(max('j).as("aggOrdering")) + .orderBy(sum('j)) + checkAnswer(query, Row(1, 2)) + } } diff --git a/sql/core/src/test/scala/org/apache/spark/sql/SQLQuerySuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/SQLQuerySuite.scala index 9e172b2c264cb..28201073a2d7b 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/SQLQuerySuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/SQLQuerySuite.scala @@ -1490,6 +1490,16 @@ class SQLQuerySuite extends QueryTest with SharedSQLContext { """.stripMargin), Row(3) :: Row(7) :: Row(11) :: Row(15) :: Nil) + checkAnswer( + sql( + """ + |SELECT sum(b) + |FROM orderByData + |GROUP BY a + |ORDER BY sum(b), max(b) + """.stripMargin), + Row(3) :: Row(7) :: Row(11) :: Row(15) :: Nil) + checkAnswer( sql( """ From fc48307797912dc1d53893dce741ddda8630957b Mon Sep 17 00:00:00 2001 From: Wenchen Fan Date: Wed, 2 Sep 2015 11:32:27 -0700 Subject: [PATCH 168/802] [SPARK-10389] [SQL] support order by non-attribute grouping expression on Aggregate For example, we can write `SELECT MAX(value) FROM src GROUP BY key + 1 ORDER BY key + 1` in PostgreSQL, and we should support this in Spark SQL. Author: Wenchen Fan Closes #8548 from cloud-fan/support-order-by-non-attribute. --- .../sql/catalyst/analysis/Analyzer.scala | 72 ++++++++++--------- .../org/apache/spark/sql/SQLQuerySuite.scala | 19 +++-- 2 files changed, 52 insertions(+), 39 deletions(-) diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/Analyzer.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/Analyzer.scala index 1a5de15c61f86..591747b45c376 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/Analyzer.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/Analyzer.scala @@ -560,43 +560,47 @@ class Analyzer( filter } - case sort @ Sort(sortOrder, global, - aggregate @ Aggregate(grouping, originalAggExprs, child)) + case sort @ Sort(sortOrder, global, aggregate: Aggregate) if aggregate.resolved && !sort.resolved => // Try resolving the ordering as though it is in the aggregate clause. try { - val aliasedOrder = sortOrder.map(o => Alias(o.child, "aggOrder")()) - val aggregatedOrdering = Aggregate(grouping, aliasedOrder, child) - val resolvedOperator: Aggregate = execute(aggregatedOrdering).asInstanceOf[Aggregate] - def resolvedAggregateOrdering = resolvedOperator.aggregateExpressions - - // Expressions that have an aggregate can be pushed down. - val needsAggregate = resolvedAggregateOrdering.exists(containsAggregate) - - // Attribute references, that are missing from the order but are present in the grouping - // expressions can also be pushed down. 
- val requiredAttributes = resolvedAggregateOrdering.map(_.references).reduce(_ ++ _) - val missingAttributes = requiredAttributes -- aggregate.outputSet - val validPushdownAttributes = - missingAttributes.filter(a => grouping.exists(a.semanticEquals)) - - // If resolution was successful and we see the ordering either has an aggregate in it or - // it is missing something that is projected away by the aggregate, add the ordering - // the original aggregate operator. - if (resolvedOperator.resolved && (needsAggregate || validPushdownAttributes.nonEmpty)) { - val evaluatedOrderings: Seq[SortOrder] = sortOrder.zip(resolvedAggregateOrdering).map { - case (order, evaluated) => order.copy(child = evaluated.toAttribute) - } - val aggExprsWithOrdering: Seq[NamedExpression] = - resolvedAggregateOrdering ++ originalAggExprs - - Project(aggregate.output, - Sort(evaluatedOrderings, global, - aggregate.copy(aggregateExpressions = aggExprsWithOrdering))) - } else { - sort + val aliasedOrdering = sortOrder.map(o => Alias(o.child, "aggOrder")()) + val aggregatedOrdering = aggregate.copy(aggregateExpressions = aliasedOrdering) + val resolvedAggregate: Aggregate = execute(aggregatedOrdering).asInstanceOf[Aggregate] + val resolvedAliasedOrdering: Seq[Alias] = + resolvedAggregate.aggregateExpressions.asInstanceOf[Seq[Alias]] + + // If we pass the analysis check, then the ordering expressions should only reference to + // aggregate expressions or grouping expressions, and it's safe to push them down to + // Aggregate. + checkAnalysis(resolvedAggregate) + + val originalAggExprs = aggregate.aggregateExpressions.map( + CleanupAliases.trimNonTopLevelAliases(_).asInstanceOf[NamedExpression]) + + // If the ordering expression is same with original aggregate expression, we don't need + // to push down this ordering expression and can reference the original aggregate + // expression instead. + val needsPushDown = ArrayBuffer.empty[NamedExpression] + val evaluatedOrderings = resolvedAliasedOrdering.zip(sortOrder).map { + case (evaluated, order) => + val index = originalAggExprs.indexWhere { + case Alias(child, _) => child semanticEquals evaluated.child + case other => other semanticEquals evaluated.child + } + + if (index == -1) { + needsPushDown += evaluated + order.copy(child = evaluated.toAttribute) + } else { + order.copy(child = originalAggExprs(index).toAttribute) + } } + + Project(aggregate.output, + Sort(evaluatedOrderings, global, + aggregate.copy(aggregateExpressions = originalAggExprs ++ needsPushDown))) } catch { // Attempting to resolve in the aggregate can result in ambiguity. When this happens, // just return the original plan. 
@@ -605,9 +609,7 @@ class Analyzer( } protected def containsAggregate(condition: Expression): Boolean = { - condition - .collect { case ae: AggregateExpression => ae } - .nonEmpty + condition.find(_.isInstanceOf[AggregateExpression]).isDefined } } diff --git a/sql/core/src/test/scala/org/apache/spark/sql/SQLQuerySuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/SQLQuerySuite.scala index 28201073a2d7b..0ef25fe0faef0 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/SQLQuerySuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/SQLQuerySuite.scala @@ -1722,9 +1722,20 @@ class SQLQuerySuite extends QueryTest with SharedSQLContext { } test("SPARK-10130 type coercion for IF should have children resolved first") { - val df = Seq((1, 1), (-1, 1)).toDF("key", "value") - df.registerTempTable("src") - checkAnswer( - sql("SELECT IF(a > 0, a, 0) FROM (SELECT key a FROM src) temp"), Seq(Row(1), Row(0))) + withTempTable("src") { + Seq((1, 1), (-1, 1)).toDF("key", "value").registerTempTable("src") + checkAnswer( + sql("SELECT IF(a > 0, a, 0) FROM (SELECT key a FROM src) temp"), Seq(Row(1), Row(0))) + } + } + + test("SPARK-10389: order by non-attribute grouping expression on Aggregate") { + withTempTable("src") { + Seq((1, 1), (-1, 1)).toDF("key", "value").registerTempTable("src") + checkAnswer(sql("SELECT MAX(value) FROM src GROUP BY key + 1 ORDER BY key + 1"), + Seq(Row(1), Row(1))) + checkAnswer(sql("SELECT MAX(value) FROM src GROUP BY key + 1 ORDER BY (key + 1) * 2"), + Seq(Row(1), Row(1))) + } } } From 2da3a9e98e5d129d4507b5db01bba5ee9558d28e Mon Sep 17 00:00:00 2001 From: Marcelo Vanzin Date: Wed, 2 Sep 2015 12:53:24 -0700 Subject: [PATCH 169/802] [SPARK-10004] [SHUFFLE] Perform auth checks when clients read shuffle data. To correctly isolate applications, when requests to read shuffle data arrive at the shuffle service, proper authorization checks need to be performed. This change makes sure that only the application that created the shuffle data can read from it. Such checks are only enabled when "spark.authenticate" is enabled, otherwise there's no secure way to make sure that the client is really who it says it is. Author: Marcelo Vanzin Closes #8218 from vanzin/SPARK-10004. --- .../network/netty/NettyBlockRpcServer.scala | 3 +- .../netty/NettyBlockTransferService.scala | 2 +- network/common/pom.xml | 4 + .../spark/network/client/TransportClient.java | 22 +++ .../network/sasl/SaslClientBootstrap.java | 2 + .../spark/network/sasl/SaslRpcHandler.java | 1 + .../server/OneForOneStreamManager.java | 31 +++- .../spark/network/server/StreamManager.java | 9 + .../server/TransportRequestHandler.java | 1 + .../shuffle/ExternalShuffleBlockHandler.java | 16 +- .../network/sasl/SaslIntegrationSuite.java | 163 +++++++++++++++--- .../ExternalShuffleBlockHandlerSuite.java | 2 +- project/MimaExcludes.scala | 1 + 13 files changed, 221 insertions(+), 36 deletions(-) diff --git a/core/src/main/scala/org/apache/spark/network/netty/NettyBlockRpcServer.scala b/core/src/main/scala/org/apache/spark/network/netty/NettyBlockRpcServer.scala index 7c170a742fb64..76968249fb625 100644 --- a/core/src/main/scala/org/apache/spark/network/netty/NettyBlockRpcServer.scala +++ b/core/src/main/scala/org/apache/spark/network/netty/NettyBlockRpcServer.scala @@ -38,6 +38,7 @@ import org.apache.spark.storage.{BlockId, StorageLevel} * is equivalent to one Spark-level shuffle block. 
*/ class NettyBlockRpcServer( + appId: String, serializer: Serializer, blockManager: BlockDataManager) extends RpcHandler with Logging { @@ -55,7 +56,7 @@ class NettyBlockRpcServer( case openBlocks: OpenBlocks => val blocks: Seq[ManagedBuffer] = openBlocks.blockIds.map(BlockId.apply).map(blockManager.getBlockData) - val streamId = streamManager.registerStream(blocks.iterator.asJava) + val streamId = streamManager.registerStream(appId, blocks.iterator.asJava) logTrace(s"Registered streamId $streamId with ${blocks.size} buffers") responseContext.onSuccess(new StreamHandle(streamId, blocks.size).toByteArray) diff --git a/core/src/main/scala/org/apache/spark/network/netty/NettyBlockTransferService.scala b/core/src/main/scala/org/apache/spark/network/netty/NettyBlockTransferService.scala index ff8aae9ebe9f0..d5ad2c9ad00e8 100644 --- a/core/src/main/scala/org/apache/spark/network/netty/NettyBlockTransferService.scala +++ b/core/src/main/scala/org/apache/spark/network/netty/NettyBlockTransferService.scala @@ -49,7 +49,7 @@ class NettyBlockTransferService(conf: SparkConf, securityManager: SecurityManage private[this] var appId: String = _ override def init(blockDataManager: BlockDataManager): Unit = { - val rpcHandler = new NettyBlockRpcServer(serializer, blockDataManager) + val rpcHandler = new NettyBlockRpcServer(conf.getAppId, serializer, blockDataManager) var serverBootstrap: Option[TransportServerBootstrap] = None var clientBootstrap: Option[TransportClientBootstrap] = None if (authEnabled) { diff --git a/network/common/pom.xml b/network/common/pom.xml index 7dc3068ab8cb7..4141fcb8267a5 100644 --- a/network/common/pom.xml +++ b/network/common/pom.xml @@ -48,6 +48,10 @@ slf4j-api provided
+ + com.google.code.findbugs + jsr305 + {% highlight python %} from pyspark.ml.regression import LinearRegression -from pyspark.mllib.regression import LabeledPoint -from pyspark.mllib.util import MLUtils # Load training data -training = MLUtils.loadLibSVMFile(sc, "data/mllib/sample_libsvm_data.txt").toDF() +training = sqlContext.read.format("libsvm").load("data/mllib/sample_libsvm_data.txt") lr = LinearRegression(maxIter=10, regParam=0.3, elasticNetParam=0.8) From b656e6134fc5cd27e1fe6b6ab30fd7633cab0b14 Mon Sep 17 00:00:00 2001 From: Yanbo Liang Date: Fri, 11 Sep 2015 08:50:35 -0700 Subject: [PATCH 251/802] [SPARK-10026] [ML] [PySpark] Implement some common Params for regression in PySpark LinearRegression and LogisticRegression lack of some Params for Python, and some Params are not shared classes which lead we need to write them for each class. These kinds of Params are list here: ```scala HasElasticNetParam HasFitIntercept HasStandardization HasThresholds ``` Here we implement them in shared params at Python side and make LinearRegression/LogisticRegression parameters peer with Scala one. Author: Yanbo Liang Closes #8508 from yanboliang/spark-10026. --- python/pyspark/ml/classification.py | 75 ++---------- .../ml/param/_shared_params_code_gen.py | 11 +- python/pyspark/ml/param/shared.py | 111 ++++++++++++++++++ python/pyspark/ml/regression.py | 42 ++----- 4 files changed, 143 insertions(+), 96 deletions(-) diff --git a/python/pyspark/ml/classification.py b/python/pyspark/ml/classification.py index 83f808efc3bf0..22bdd1b322aca 100644 --- a/python/pyspark/ml/classification.py +++ b/python/pyspark/ml/classification.py @@ -31,7 +31,8 @@ @inherit_doc class LogisticRegression(JavaEstimator, HasFeaturesCol, HasLabelCol, HasPredictionCol, HasMaxIter, - HasRegParam, HasTol, HasProbabilityCol, HasRawPredictionCol): + HasRegParam, HasTol, HasProbabilityCol, HasRawPredictionCol, + HasElasticNetParam, HasFitIntercept, HasStandardization, HasThresholds): """ Logistic regression. Currently, this class only supports binary classification. @@ -65,17 +66,6 @@ class LogisticRegression(JavaEstimator, HasFeaturesCol, HasLabelCol, HasPredicti """ # a placeholder to make it appear in the generated doc - elasticNetParam = \ - Param(Params._dummy(), "elasticNetParam", - "the ElasticNet mixing parameter, in range [0, 1]. For alpha = 0, " + - "the penalty is an L2 penalty. For alpha = 1, it is an L1 penalty.") - fitIntercept = Param(Params._dummy(), "fitIntercept", "whether to fit an intercept term.") - thresholds = Param(Params._dummy(), "thresholds", - "Thresholds in multi-class classification" + - " to adjust the probability of predicting each class." + - " Array must have length equal to the number of classes, with values >= 0." + - " The class with largest value p/t is predicted, where p is the original" + - " probability of that class and t is the class' threshold.") threshold = Param(Params._dummy(), "threshold", "Threshold in binary classification prediction, in range [0, 1]." 
+ " If threshold and thresholds are both set, they must match.") @@ -83,40 +73,23 @@ class LogisticRegression(JavaEstimator, HasFeaturesCol, HasLabelCol, HasPredicti @keyword_only def __init__(self, featuresCol="features", labelCol="label", predictionCol="prediction", maxIter=100, regParam=0.1, elasticNetParam=0.0, tol=1e-6, fitIntercept=True, - threshold=0.5, thresholds=None, - probabilityCol="probability", rawPredictionCol="rawPrediction"): + threshold=0.5, thresholds=None, probabilityCol="probability", + rawPredictionCol="rawPrediction", standardization=True): """ __init__(self, featuresCol="features", labelCol="label", predictionCol="prediction", \ maxIter=100, regParam=0.1, elasticNetParam=0.0, tol=1e-6, fitIntercept=True, \ - threshold=0.5, thresholds=None, \ - probabilityCol="probability", rawPredictionCol="rawPrediction") + threshold=0.5, thresholds=None, probabilityCol="probability", \ + rawPredictionCol="rawPrediction", standardization=True) If the threshold and thresholds Params are both set, they must be equivalent. """ super(LogisticRegression, self).__init__() self._java_obj = self._new_java_obj( "org.apache.spark.ml.classification.LogisticRegression", self.uid) - #: param for the ElasticNet mixing parameter, in range [0, 1]. For alpha = 0, the penalty - # is an L2 penalty. For alpha = 1, it is an L1 penalty. - self.elasticNetParam = \ - Param(self, "elasticNetParam", - "the ElasticNet mixing parameter, in range [0, 1]. For alpha = 0, " + - "the penalty is an L2 penalty. For alpha = 1, it is an L1 penalty.") - #: param for whether to fit an intercept term. - self.fitIntercept = Param(self, "fitIntercept", "whether to fit an intercept term.") #: param for threshold in binary classification, in range [0, 1]. self.threshold = Param(self, "threshold", "Threshold in binary classification prediction, in range [0, 1]." + " If threshold and thresholds are both set, they must match.") - #: param for thresholds or cutoffs in binary or multiclass classification - self.thresholds = \ - Param(self, "thresholds", - "Thresholds in multi-class classification" + - " to adjust the probability of predicting each class." + - " Array must have length equal to the number of classes, with values >= 0." 
+ - " The class with largest value p/t is predicted, where p is the original" + - " probability of that class and t is the class' threshold.") - self._setDefault(maxIter=100, regParam=0.1, elasticNetParam=0.0, tol=1E-6, - fitIntercept=True, threshold=0.5) + self._setDefault(maxIter=100, regParam=0.1, tol=1E-6, threshold=0.5) kwargs = self.__init__._input_kwargs self.setParams(**kwargs) self._checkThresholdConsistency() @@ -124,13 +97,13 @@ def __init__(self, featuresCol="features", labelCol="label", predictionCol="pred @keyword_only def setParams(self, featuresCol="features", labelCol="label", predictionCol="prediction", maxIter=100, regParam=0.1, elasticNetParam=0.0, tol=1e-6, fitIntercept=True, - threshold=0.5, thresholds=None, - probabilityCol="probability", rawPredictionCol="rawPrediction"): + threshold=0.5, thresholds=None, probabilityCol="probability", + rawPredictionCol="rawPrediction", standardization=True): """ setParams(self, featuresCol="features", labelCol="label", predictionCol="prediction", \ maxIter=100, regParam=0.1, elasticNetParam=0.0, tol=1e-6, fitIntercept=True, \ - threshold=0.5, thresholds=None, \ - probabilityCol="probability", rawPredictionCol="rawPrediction") + threshold=0.5, thresholds=None, probabilityCol="probability", \ + rawPredictionCol="rawPrediction", standardization=True) Sets params for logistic regression. If the threshold and thresholds Params are both set, they must be equivalent. """ @@ -142,32 +115,6 @@ def setParams(self, featuresCol="features", labelCol="label", predictionCol="pre def _create_model(self, java_model): return LogisticRegressionModel(java_model) - def setElasticNetParam(self, value): - """ - Sets the value of :py:attr:`elasticNetParam`. - """ - self._paramMap[self.elasticNetParam] = value - return self - - def getElasticNetParam(self): - """ - Gets the value of elasticNetParam or its default value. - """ - return self.getOrDefault(self.elasticNetParam) - - def setFitIntercept(self, value): - """ - Sets the value of :py:attr:`fitIntercept`. - """ - self._paramMap[self.fitIntercept] = value - return self - - def getFitIntercept(self): - """ - Gets the value of fitIntercept or its default value. - """ - return self.getOrDefault(self.fitIntercept) - def setThreshold(self, value): """ Sets the value of :py:attr:`threshold`. diff --git a/python/pyspark/ml/param/_shared_params_code_gen.py b/python/pyspark/ml/param/_shared_params_code_gen.py index 926375e44871d..5b39e5dd4e25b 100644 --- a/python/pyspark/ml/param/_shared_params_code_gen.py +++ b/python/pyspark/ml/param/_shared_params_code_gen.py @@ -124,7 +124,16 @@ def get$Name(self): ("stepSize", "Step size to be used for each iteration of optimization.", None), ("handleInvalid", "how to handle invalid entries. Options are skip (which will filter " + "out rows with bad values), or error (which will throw an errror). More options may be " + - "added later.", None)] + "added later.", None), + ("elasticNetParam", "the ElasticNet mixing parameter, in range [0, 1]. For alpha = 0, " + + "the penalty is an L2 penalty. For alpha = 1, it is an L1 penalty.", "0.0"), + ("fitIntercept", "whether to fit an intercept term.", "True"), + ("standardization", "whether to standardize the training features before fitting the " + + "model.", "True"), + ("thresholds", "Thresholds in multi-class classification to adjust the probability of " + + "predicting each class. Array must have length equal to the number of classes, with " + + "values >= 0. 
The class with largest value p/t is predicted, where p is the original " + + "probability of that class and t is the class' threshold.", None)] code = [] for name, doc, defaultValueStr in shared: param_code = _gen_param_header(name, doc, defaultValueStr) diff --git a/python/pyspark/ml/param/shared.py b/python/pyspark/ml/param/shared.py index 682170aee85fb..af1218128602b 100644 --- a/python/pyspark/ml/param/shared.py +++ b/python/pyspark/ml/param/shared.py @@ -459,6 +459,117 @@ def getHandleInvalid(self): return self.getOrDefault(self.handleInvalid) +class HasElasticNetParam(Params): + """ + Mixin for param elasticNetParam: the ElasticNet mixing parameter, in range [0, 1]. For alpha = 0, the penalty is an L2 penalty. For alpha = 1, it is an L1 penalty.. + """ + + # a placeholder to make it appear in the generated doc + elasticNetParam = Param(Params._dummy(), "elasticNetParam", "the ElasticNet mixing parameter, in range [0, 1]. For alpha = 0, the penalty is an L2 penalty. For alpha = 1, it is an L1 penalty.") + + def __init__(self): + super(HasElasticNetParam, self).__init__() + #: param for the ElasticNet mixing parameter, in range [0, 1]. For alpha = 0, the penalty is an L2 penalty. For alpha = 1, it is an L1 penalty. + self.elasticNetParam = Param(self, "elasticNetParam", "the ElasticNet mixing parameter, in range [0, 1]. For alpha = 0, the penalty is an L2 penalty. For alpha = 1, it is an L1 penalty.") + self._setDefault(elasticNetParam=0.0) + + def setElasticNetParam(self, value): + """ + Sets the value of :py:attr:`elasticNetParam`. + """ + self._paramMap[self.elasticNetParam] = value + return self + + def getElasticNetParam(self): + """ + Gets the value of elasticNetParam or its default value. + """ + return self.getOrDefault(self.elasticNetParam) + + +class HasFitIntercept(Params): + """ + Mixin for param fitIntercept: whether to fit an intercept term.. + """ + + # a placeholder to make it appear in the generated doc + fitIntercept = Param(Params._dummy(), "fitIntercept", "whether to fit an intercept term.") + + def __init__(self): + super(HasFitIntercept, self).__init__() + #: param for whether to fit an intercept term. + self.fitIntercept = Param(self, "fitIntercept", "whether to fit an intercept term.") + self._setDefault(fitIntercept=True) + + def setFitIntercept(self, value): + """ + Sets the value of :py:attr:`fitIntercept`. + """ + self._paramMap[self.fitIntercept] = value + return self + + def getFitIntercept(self): + """ + Gets the value of fitIntercept or its default value. + """ + return self.getOrDefault(self.fitIntercept) + + +class HasStandardization(Params): + """ + Mixin for param standardization: whether to standardize the training features before fitting the model.. + """ + + # a placeholder to make it appear in the generated doc + standardization = Param(Params._dummy(), "standardization", "whether to standardize the training features before fitting the model.") + + def __init__(self): + super(HasStandardization, self).__init__() + #: param for whether to standardize the training features before fitting the model. + self.standardization = Param(self, "standardization", "whether to standardize the training features before fitting the model.") + self._setDefault(standardization=True) + + def setStandardization(self, value): + """ + Sets the value of :py:attr:`standardization`. + """ + self._paramMap[self.standardization] = value + return self + + def getStandardization(self): + """ + Gets the value of standardization or its default value. 
+ """ + return self.getOrDefault(self.standardization) + + +class HasThresholds(Params): + """ + Mixin for param thresholds: Thresholds in multi-class classification to adjust the probability of predicting each class. Array must have length equal to the number of classes, with values >= 0. The class with largest value p/t is predicted, where p is the original probability of that class and t is the class' threshold.. + """ + + # a placeholder to make it appear in the generated doc + thresholds = Param(Params._dummy(), "thresholds", "Thresholds in multi-class classification to adjust the probability of predicting each class. Array must have length equal to the number of classes, with values >= 0. The class with largest value p/t is predicted, where p is the original probability of that class and t is the class' threshold.") + + def __init__(self): + super(HasThresholds, self).__init__() + #: param for Thresholds in multi-class classification to adjust the probability of predicting each class. Array must have length equal to the number of classes, with values >= 0. The class with largest value p/t is predicted, where p is the original probability of that class and t is the class' threshold. + self.thresholds = Param(self, "thresholds", "Thresholds in multi-class classification to adjust the probability of predicting each class. Array must have length equal to the number of classes, with values >= 0. The class with largest value p/t is predicted, where p is the original probability of that class and t is the class' threshold.") + + def setThresholds(self, value): + """ + Sets the value of :py:attr:`thresholds`. + """ + self._paramMap[self.thresholds] = value + return self + + def getThresholds(self): + """ + Gets the value of thresholds or its default value. + """ + return self.getOrDefault(self.thresholds) + + class DecisionTreeParams(Params): """ Mixin for Decision Tree parameters. diff --git a/python/pyspark/ml/regression.py b/python/pyspark/ml/regression.py index 44f60a769566d..a9503608b7f25 100644 --- a/python/pyspark/ml/regression.py +++ b/python/pyspark/ml/regression.py @@ -28,7 +28,8 @@ @inherit_doc class LinearRegression(JavaEstimator, HasFeaturesCol, HasLabelCol, HasPredictionCol, HasMaxIter, - HasRegParam, HasTol): + HasRegParam, HasTol, HasElasticNetParam, HasFitIntercept, + HasStandardization): """ Linear regression. @@ -63,38 +64,30 @@ class LinearRegression(JavaEstimator, HasFeaturesCol, HasLabelCol, HasPrediction TypeError: Method setParams forces keyword arguments. """ - # a placeholder to make it appear in the generated doc - elasticNetParam = \ - Param(Params._dummy(), "elasticNetParam", - "the ElasticNet mixing parameter, in range [0, 1]. For alpha = 0, " + - "the penalty is an L2 penalty. For alpha = 1, it is an L1 penalty.") - @keyword_only def __init__(self, featuresCol="features", labelCol="label", predictionCol="prediction", - maxIter=100, regParam=0.0, elasticNetParam=0.0, tol=1e-6): + maxIter=100, regParam=0.0, elasticNetParam=0.0, tol=1e-6, fitIntercept=True, + standardization=True): """ __init__(self, featuresCol="features", labelCol="label", predictionCol="prediction", \ - maxIter=100, regParam=0.0, elasticNetParam=0.0, tol=1e-6) + maxIter=100, regParam=0.0, elasticNetParam=0.0, tol=1e-6, fitIntercept=True, \ + standardization=True) """ super(LinearRegression, self).__init__() self._java_obj = self._new_java_obj( "org.apache.spark.ml.regression.LinearRegression", self.uid) - #: param for the ElasticNet mixing parameter, in range [0, 1]. 
For alpha = 0, the penalty - # is an L2 penalty. For alpha = 1, it is an L1 penalty. - self.elasticNetParam = \ - Param(self, "elasticNetParam", - "the ElasticNet mixing parameter, in range [0, 1]. For alpha = 0, the penalty " + - "is an L2 penalty. For alpha = 1, it is an L1 penalty.") - self._setDefault(maxIter=100, regParam=0.0, elasticNetParam=0.0, tol=1e-6) + self._setDefault(maxIter=100, regParam=0.0, tol=1e-6) kwargs = self.__init__._input_kwargs self.setParams(**kwargs) @keyword_only def setParams(self, featuresCol="features", labelCol="label", predictionCol="prediction", - maxIter=100, regParam=0.0, elasticNetParam=0.0, tol=1e-6): + maxIter=100, regParam=0.0, elasticNetParam=0.0, tol=1e-6, fitIntercept=True, + standardization=True): """ setParams(self, featuresCol="features", labelCol="label", predictionCol="prediction", \ - maxIter=100, regParam=0.0, elasticNetParam=0.0, tol=1e-6) + maxIter=100, regParam=0.0, elasticNetParam=0.0, tol=1e-6, fitIntercept=True, \ + standardization=True) Sets params for linear regression. """ kwargs = self.setParams._input_kwargs @@ -103,19 +96,6 @@ def setParams(self, featuresCol="features", labelCol="label", predictionCol="pre def _create_model(self, java_model): return LinearRegressionModel(java_model) - def setElasticNetParam(self, value): - """ - Sets the value of :py:attr:`elasticNetParam`. - """ - self._paramMap[self.elasticNetParam] = value - return self - - def getElasticNetParam(self): - """ - Gets the value of elasticNetParam or its default value. - """ - return self.getOrDefault(self.elasticNetParam) - class LinearRegressionModel(JavaModel): """ From b01b26260625f0ba14e5f3010207666d62d93864 Mon Sep 17 00:00:00 2001 From: Yanbo Liang Date: Fri, 11 Sep 2015 08:52:28 -0700 Subject: [PATCH 252/802] [SPARK-9773] [ML] [PySpark] Add Python API for MultilayerPerceptronClassifier Add Python API for ```MultilayerPerceptronClassifier```. Author: Yanbo Liang Closes #8067 from yanboliang/SPARK-9773. --- .../MultilayerPerceptronClassifier.scala | 9 ++ python/pyspark/ml/classification.py | 132 +++++++++++++++++- 2 files changed, 140 insertions(+), 1 deletion(-) diff --git a/mllib/src/main/scala/org/apache/spark/ml/classification/MultilayerPerceptronClassifier.scala b/mllib/src/main/scala/org/apache/spark/ml/classification/MultilayerPerceptronClassifier.scala index 82fc80c58054f..5f60dea91fcfa 100644 --- a/mllib/src/main/scala/org/apache/spark/ml/classification/MultilayerPerceptronClassifier.scala +++ b/mllib/src/main/scala/org/apache/spark/ml/classification/MultilayerPerceptronClassifier.scala @@ -17,6 +17,8 @@ package org.apache.spark.ml.classification +import scala.collection.JavaConverters._ + import org.apache.spark.annotation.Experimental import org.apache.spark.ml.param.shared.{HasTol, HasMaxIter, HasSeed} import org.apache.spark.ml.{PredictorParams, PredictionModel, Predictor} @@ -181,6 +183,13 @@ class MultilayerPerceptronClassificationModel private[ml] ( private val mlpModel = FeedForwardTopology.multiLayerPerceptron(layers, true).getInstance(weights) + /** + * Returns layers in a Java List. + */ + private[ml] def javaLayers: java.util.List[Int] = { + layers.toList.asJava + } + /** * Predict label for the given features. * This internal method is used to implement [[transform()]] and output [[predictionCol]]. 
diff --git a/python/pyspark/ml/classification.py b/python/pyspark/ml/classification.py index 22bdd1b322aca..88815e561f572 100644 --- a/python/pyspark/ml/classification.py +++ b/python/pyspark/ml/classification.py @@ -26,7 +26,8 @@ __all__ = ['LogisticRegression', 'LogisticRegressionModel', 'DecisionTreeClassifier', 'DecisionTreeClassificationModel', 'GBTClassifier', 'GBTClassificationModel', 'RandomForestClassifier', 'RandomForestClassificationModel', 'NaiveBayes', - 'NaiveBayesModel'] + 'NaiveBayesModel', 'MultilayerPerceptronClassifier', + 'MultilayerPerceptronClassificationModel'] @inherit_doc @@ -755,6 +756,135 @@ def theta(self): return self._call_java("theta") +@inherit_doc +class MultilayerPerceptronClassifier(JavaEstimator, HasFeaturesCol, HasLabelCol, HasPredictionCol, + HasMaxIter, HasTol, HasSeed): + """ + Classifier trainer based on the Multilayer Perceptron. + Each layer has sigmoid activation function, output layer has softmax. + Number of inputs has to be equal to the size of feature vectors. + Number of outputs has to be equal to the total number of labels. + + >>> from pyspark.mllib.linalg import Vectors + >>> df = sqlContext.createDataFrame([ + ... (0.0, Vectors.dense([0.0, 0.0])), + ... (1.0, Vectors.dense([0.0, 1.0])), + ... (1.0, Vectors.dense([1.0, 0.0])), + ... (0.0, Vectors.dense([1.0, 1.0]))], ["label", "features"]) + >>> mlp = MultilayerPerceptronClassifier(maxIter=100, layers=[2, 5, 2], blockSize=1, seed=11) + >>> model = mlp.fit(df) + >>> model.layers + [2, 5, 2] + >>> model.weights.size + 27 + >>> testDF = sqlContext.createDataFrame([ + ... (Vectors.dense([1.0, 0.0]),), + ... (Vectors.dense([0.0, 0.0]),)], ["features"]) + >>> model.transform(testDF).show() + +---------+----------+ + | features|prediction| + +---------+----------+ + |[1.0,0.0]| 1.0| + |[0.0,0.0]| 0.0| + +---------+----------+ + ... + """ + + # a placeholder to make it appear in the generated doc + layers = Param(Params._dummy(), "layers", "Sizes of layers from input layer to output layer " + + "E.g., Array(780, 100, 10) means 780 inputs, one hidden layer with 100 " + + "neurons and output layer of 10 neurons, default is [1, 1].") + blockSize = Param(Params._dummy(), "blockSize", "Block size for stacking input data in " + + "matrices. Data is stacked within partitions. If block size is more than " + + "remaining data in a partition then it is adjusted to the size of this " + + "data. Recommended size is between 10 and 1000, default is 128.") + + @keyword_only + def __init__(self, featuresCol="features", labelCol="label", predictionCol="prediction", + maxIter=100, tol=1e-4, seed=None, layers=None, blockSize=128): + """ + __init__(self, featuresCol="features", labelCol="label", predictionCol="prediction", \ + maxIter=100, tol=1e-4, seed=None, layers=[1, 1], blockSize=128) + """ + super(MultilayerPerceptronClassifier, self).__init__() + self._java_obj = self._new_java_obj( + "org.apache.spark.ml.classification.MultilayerPerceptronClassifier", self.uid) + self.layers = Param(self, "layers", "Sizes of layers from input layer to output layer " + + "E.g., Array(780, 100, 10) means 780 inputs, one hidden layer with " + + "100 neurons and output layer of 10 neurons, default is [1, 1].") + self.blockSize = Param(self, "blockSize", "Block size for stacking input data in " + + "matrices. Data is stacked within partitions. If block size is " + + "more than remaining data in a partition then it is adjusted to " + + "the size of this data. 
Recommended size is between 10 and 1000, " + + "default is 128.") + self._setDefault(maxIter=100, tol=1E-4, layers=[1, 1], blockSize=128) + kwargs = self.__init__._input_kwargs + self.setParams(**kwargs) + + @keyword_only + def setParams(self, featuresCol="features", labelCol="label", predictionCol="prediction", + maxIter=100, tol=1e-4, seed=None, layers=None, blockSize=128): + """ + setParams(self, featuresCol="features", labelCol="label", predictionCol="prediction", \ + maxIter=100, tol=1e-4, seed=None, layers=[1, 1], blockSize=128) + Sets params for MultilayerPerceptronClassifier. + """ + kwargs = self.setParams._input_kwargs + if layers is None: + return self._set(**kwargs).setLayers([1, 1]) + else: + return self._set(**kwargs) + + def _create_model(self, java_model): + return MultilayerPerceptronClassificationModel(java_model) + + def setLayers(self, value): + """ + Sets the value of :py:attr:`layers`. + """ + self._paramMap[self.layers] = value + return self + + def getLayers(self): + """ + Gets the value of layers or its default value. + """ + return self.getOrDefault(self.layers) + + def setBlockSize(self, value): + """ + Sets the value of :py:attr:`blockSize`. + """ + self._paramMap[self.blockSize] = value + return self + + def getBlockSize(self): + """ + Gets the value of blockSize or its default value. + """ + return self.getOrDefault(self.blockSize) + + +class MultilayerPerceptronClassificationModel(JavaModel): + """ + Model fitted by MultilayerPerceptronClassifier. + """ + + @property + def layers(self): + """ + array of layer sizes including input and output layers. + """ + return self._call_java("javaLayers") + + @property + def weights(self): + """ + vector of initial weights for the model that consists of the weights of layers. + """ + return self._call_java("weights") + + if __name__ == "__main__": import doctest from pyspark.context import SparkContext From 960d2d0ac6b5a22242a922f87f745f7d1f736181 Mon Sep 17 00:00:00 2001 From: Xiangrui Meng Date: Fri, 11 Sep 2015 08:53:40 -0700 Subject: [PATCH 253/802] [SPARK-10537] [ML] document LIBSVM source options in public API doc and some minor improvements We should document options in public API doc. Otherwise, it is hard to find out the options without looking at the code. I tried to make `DefaultSource` private and put the documentation to package doc. However, since then there exists no public class under `source.libsvm`, the Java package doc doesn't show up in the generated html file (http://bugs.java.com/bugdatabase/view_bug.do?bug_id=4492654). So I put the doc to `DefaultSource` instead. There are several minor updates in this PR: 1. Do `vectorType == "sparse"` only once. 2. Update `hashCode` and `equals`. 3. Remove inherited doc. 4. Delete temp dir in `afterAll`. Lewuathe Author: Xiangrui Meng Closes #8699 from mengxr/SPARK-10537. 
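Before the diff: a minimal PySpark sketch of loading data through the options this patch documents ("numFeatures" and "vectorType"). It assumes a pre-created sqlContext, as in the doctests elsewhere in this series, and the sample LIBSVM file that ships with Spark; it illustrates the documented options and is not part of the patch itself.

{{{
# Assumes sqlContext already exists (as in the PySpark doctests in this series).
df = (sqlContext.read
      .format("libsvm")
      .option("numFeatures", "780")   # skip the extra pass that infers the dimension
      .option("vectorType", "dense")  # default is "sparse"
      .load("data/mllib/sample_libsvm_data.txt"))

df.printSchema()  # two columns: label (double) and features (vector)
}}}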
--- .../ml/source/libsvm/LibSVMRelation.scala | 71 ++++++++++++------- .../{ => libsvm}/JavaLibSVMRelationSuite.java | 24 +++---- .../{ => libsvm}/LibSVMRelationSuite.scala | 14 ++-- 3 files changed, 66 insertions(+), 43 deletions(-) rename mllib/src/test/java/org/apache/spark/ml/source/{ => libsvm}/JavaLibSVMRelationSuite.java (79%) rename mllib/src/test/scala/org/apache/spark/ml/source/{ => libsvm}/LibSVMRelationSuite.scala (88%) diff --git a/mllib/src/main/scala/org/apache/spark/ml/source/libsvm/LibSVMRelation.scala b/mllib/src/main/scala/org/apache/spark/ml/source/libsvm/LibSVMRelation.scala index b12cb62a4ef15..1f627777fc68d 100644 --- a/mllib/src/main/scala/org/apache/spark/ml/source/libsvm/LibSVMRelation.scala +++ b/mllib/src/main/scala/org/apache/spark/ml/source/libsvm/LibSVMRelation.scala @@ -21,12 +21,12 @@ import com.google.common.base.Objects import org.apache.spark.Logging import org.apache.spark.annotation.Since -import org.apache.spark.mllib.linalg.VectorUDT +import org.apache.spark.mllib.linalg.{Vector, VectorUDT} import org.apache.spark.mllib.util.MLUtils import org.apache.spark.rdd.RDD -import org.apache.spark.sql.types.{StructType, StructField, DoubleType} -import org.apache.spark.sql.{Row, SQLContext} +import org.apache.spark.sql.{DataFrameReader, DataFrame, Row, SQLContext} import org.apache.spark.sql.sources._ +import org.apache.spark.sql.types.{DoubleType, StructField, StructType} /** * LibSVMRelation provides the DataFrame constructed from LibSVM format data. @@ -35,7 +35,7 @@ import org.apache.spark.sql.sources._ * @param vectorType The type of vector. It can be 'sparse' or 'dense' * @param sqlContext The Spark SQLContext */ -private[ml] class LibSVMRelation(val path: String, val numFeatures: Int, val vectorType: String) +private[libsvm] class LibSVMRelation(val path: String, val numFeatures: Int, val vectorType: String) (@transient val sqlContext: SQLContext) extends BaseRelation with TableScan with Logging with Serializable { @@ -47,27 +47,56 @@ private[ml] class LibSVMRelation(val path: String, val numFeatures: Int, val vec override def buildScan(): RDD[Row] = { val sc = sqlContext.sparkContext val baseRdd = MLUtils.loadLibSVMFile(sc, path, numFeatures) - + val sparse = vectorType == "sparse" baseRdd.map { pt => - val features = if (vectorType == "dense") pt.features.toDense else pt.features.toSparse + val features = if (sparse) pt.features.toSparse else pt.features.toDense Row(pt.label, features) } } override def hashCode(): Int = { - Objects.hashCode(path, schema) + Objects.hashCode(path, Double.box(numFeatures), vectorType) } override def equals(other: Any): Boolean = other match { - case that: LibSVMRelation => (this.path == that.path) && this.schema.equals(that.schema) - case _ => false + case that: LibSVMRelation => + path == that.path && + numFeatures == that.numFeatures && + vectorType == that.vectorType + case _ => + false } - } /** - * This is used for creating DataFrame from LibSVM format file. - * The LibSVM file path must be specified to DefaultSource. + * `libsvm` package implements Spark SQL data source API for loading LIBSVM data as [[DataFrame]]. + * The loaded [[DataFrame]] has two columns: `label` containing labels stored as doubles and + * `features` containing feature vectors stored as [[Vector]]s. 
+ * + * To use LIBSVM data source, you need to set "libsvm" as the format in [[DataFrameReader]] and + * optionally specify options, for example: + * {{{ + * // Scala + * val df = sqlContext.read.format("libsvm") + * .option("numFeatures", "780") + * .load("data/mllib/sample_libsvm_data.txt") + * + * // Java + * DataFrame df = sqlContext.read.format("libsvm") + * .option("numFeatures, "780") + * .load("data/mllib/sample_libsvm_data.txt"); + * }}} + * + * LIBSVM data source supports the following options: + * - "numFeatures": number of features. + * If unspecified or nonpositive, the number of features will be determined automatically at the + * cost of one additional pass. + * This is also useful when the dataset is already split into multiple files and you want to load + * them separately, because some features may not present in certain files, which leads to + * inconsistent feature dimensions. + * - "vectorType": feature vector type, "sparse" (default) or "dense". + * + * @see [[https://www.csie.ntu.edu.tw/~cjlin/libsvmtools/datasets/ LIBSVM datasets]] */ @Since("1.6.0") class DefaultSource extends RelationProvider with DataSourceRegister { @@ -75,24 +104,12 @@ class DefaultSource extends RelationProvider with DataSourceRegister { @Since("1.6.0") override def shortName(): String = "libsvm" - private def checkPath(parameters: Map[String, String]): String = { - require(parameters.contains("path"), "'path' must be specified") - parameters.get("path").get - } - - /** - * Returns a new base relation with the given parameters. - * Note: the parameters' keywords are case insensitive and this insensitivity is enforced - * by the Map that is passed to the function. - */ + @Since("1.6.0") override def createRelation(sqlContext: SQLContext, parameters: Map[String, String]) : BaseRelation = { - val path = checkPath(parameters) + val path = parameters.getOrElse("path", + throw new IllegalArgumentException("'path' must be specified")) val numFeatures = parameters.getOrElse("numFeatures", "-1").toInt - /** - * featuresType can be selected "dense" or "sparse". - * This parameter decides the type of returned feature vector. - */ val vectorType = parameters.getOrElse("vectorType", "sparse") new LibSVMRelation(path, numFeatures, vectorType)(sqlContext) } diff --git a/mllib/src/test/java/org/apache/spark/ml/source/JavaLibSVMRelationSuite.java b/mllib/src/test/java/org/apache/spark/ml/source/libsvm/JavaLibSVMRelationSuite.java similarity index 79% rename from mllib/src/test/java/org/apache/spark/ml/source/JavaLibSVMRelationSuite.java rename to mllib/src/test/java/org/apache/spark/ml/source/libsvm/JavaLibSVMRelationSuite.java index 11fa4eec0ccf0..2976b38e45031 100644 --- a/mllib/src/test/java/org/apache/spark/ml/source/JavaLibSVMRelationSuite.java +++ b/mllib/src/test/java/org/apache/spark/ml/source/libsvm/JavaLibSVMRelationSuite.java @@ -15,7 +15,7 @@ * limitations under the License. 
*/ -package org.apache.spark.ml.source; +package org.apache.spark.ml.source.libsvm; import java.io.File; import java.io.IOException; @@ -42,34 +42,34 @@ */ public class JavaLibSVMRelationSuite { private transient JavaSparkContext jsc; - private transient SQLContext jsql; - private transient DataFrame dataset; + private transient SQLContext sqlContext; - private File tmpDir; - private File path; + private File tempDir; + private String path; @Before public void setUp() throws IOException { jsc = new JavaSparkContext("local", "JavaLibSVMRelationSuite"); - jsql = new SQLContext(jsc); - - tmpDir = Utils.createTempDir(System.getProperty("java.io.tmpdir"), "datasource"); - path = new File(tmpDir.getPath(), "part-00000"); + sqlContext = new SQLContext(jsc); + tempDir = Utils.createTempDir(System.getProperty("java.io.tmpdir"), "datasource"); + File file = new File(tempDir, "part-00000"); String s = "1 1:1.0 3:2.0 5:3.0\n0\n0 2:4.0 4:5.0 6:6.0"; - Files.write(s, path, Charsets.US_ASCII); + Files.write(s, file, Charsets.US_ASCII); + path = tempDir.toURI().toString(); } @After public void tearDown() { jsc.stop(); jsc = null; - Utils.deleteRecursively(tmpDir); + Utils.deleteRecursively(tempDir); } @Test public void verifyLibSVMDF() { - dataset = jsql.read().format("libsvm").option("vectorType", "dense").load(path.getPath()); + DataFrame dataset = sqlContext.read().format("libsvm").option("vectorType", "dense") + .load(path); Assert.assertEquals("label", dataset.columns()[0]); Assert.assertEquals("features", dataset.columns()[1]); Row r = dataset.first(); diff --git a/mllib/src/test/scala/org/apache/spark/ml/source/LibSVMRelationSuite.scala b/mllib/src/test/scala/org/apache/spark/ml/source/libsvm/LibSVMRelationSuite.scala similarity index 88% rename from mllib/src/test/scala/org/apache/spark/ml/source/LibSVMRelationSuite.scala rename to mllib/src/test/scala/org/apache/spark/ml/source/libsvm/LibSVMRelationSuite.scala index 8ed134128c8d2..997f574e51f6a 100644 --- a/mllib/src/test/scala/org/apache/spark/ml/source/LibSVMRelationSuite.scala +++ b/mllib/src/test/scala/org/apache/spark/ml/source/libsvm/LibSVMRelationSuite.scala @@ -15,7 +15,7 @@ * limitations under the License. */ -package org.apache.spark.ml.source +package org.apache.spark.ml.source.libsvm import java.io.File @@ -23,11 +23,12 @@ import com.google.common.base.Charsets import com.google.common.io.Files import org.apache.spark.SparkFunSuite -import org.apache.spark.mllib.linalg.{SparseVector, Vectors, DenseVector} +import org.apache.spark.mllib.linalg.{DenseVector, SparseVector, Vectors} import org.apache.spark.mllib.util.MLlibTestSparkContext import org.apache.spark.util.Utils class LibSVMRelationSuite extends SparkFunSuite with MLlibTestSparkContext { + var tempDir: File = _ var path: String = _ override def beforeAll(): Unit = { @@ -38,12 +39,17 @@ class LibSVMRelationSuite extends SparkFunSuite with MLlibTestSparkContext { |0 |0 2:4.0 4:5.0 6:6.0 """.stripMargin - val tempDir = Utils.createTempDir() - val file = new File(tempDir.getPath, "part-00000") + tempDir = Utils.createTempDir() + val file = new File(tempDir, "part-00000") Files.write(lines, file, Charsets.US_ASCII) path = tempDir.toURI.toString } + override def afterAll(): Unit = { + Utils.deleteRecursively(tempDir) + super.afterAll() + } + test("select as sparse vector") { val df = sqlContext.read.format("libsvm").load(path) assert(df.columns(0) == "label") From 2e3a280754a28dc36a71b9ff988e34cbf457f6c3 Mon Sep 17 00:00:00 2001 From: "Joseph K. 
Bradley" Date: Fri, 11 Sep 2015 08:55:35 -0700 Subject: [PATCH 254/802] [MINOR] [MLLIB] [ML] [DOC] Minor doc fixes for StringIndexer and MetadataUtils MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Changes: * Make Scala doc for StringIndexerInverse clearer. Also remove Scala doc from transformSchema, so that the doc is inherited. * MetadataUtils.scala: “ Helper utilities for tree-based algorithms” —> not just trees anymore CC: holdenk mengxr Author: Joseph K. Bradley Closes #8679 from jkbradley/doc-fixes-1.5. --- .../spark/ml/feature/StringIndexer.scala | 31 +++++++------------ .../apache/spark/ml/util/MetadataUtils.scala | 2 +- python/pyspark/ml/feature.py | 16 +++++----- 3 files changed, 20 insertions(+), 29 deletions(-) diff --git a/mllib/src/main/scala/org/apache/spark/ml/feature/StringIndexer.scala b/mllib/src/main/scala/org/apache/spark/ml/feature/StringIndexer.scala index b6482ffe0b2ee..3a4ab9a857648 100644 --- a/mllib/src/main/scala/org/apache/spark/ml/feature/StringIndexer.scala +++ b/mllib/src/main/scala/org/apache/spark/ml/feature/StringIndexer.scala @@ -181,10 +181,10 @@ class StringIndexerModel ( /** * :: Experimental :: - * A [[Transformer]] that maps a column of string indices back to a new column of corresponding - * string values using either the ML attributes of the input column, or if provided using the labels - * supplied by the user. - * All original columns are kept during transformation. + * A [[Transformer]] that maps a column of indices back to a new column of corresponding + * string values. + * The index-string mapping is either from the ML attributes of the input column, + * or from user-supplied labels (which take precedence over ML attributes). * * @see [[StringIndexer]] for converting strings into indices */ @@ -202,32 +202,23 @@ class IndexToString private[ml] ( /** @group setParam */ def setOutputCol(value: String): this.type = set(outputCol, value) - /** - * Optional labels to be provided by the user, if not supplied column - * metadata is read for labels. The default value is an empty array, - * but the empty array is ignored and column metadata used instead. - * @group setParam - */ + /** @group setParam */ def setLabels(value: Array[String]): this.type = set(labels, value) /** - * Param for array of labels. - * Optional labels to be provided by the user. - * Default: Empty array, in which case column metadata is used for labels. + * Optional param for array of labels specifying index-string mapping. + * + * Default: Empty array, in which case [[inputCol]] metadata is used for labels. * @group param */ final val labels: StringArrayParam = new StringArrayParam(this, "labels", - "array of labels, if not provided metadata from inputCol is used instead.") + "Optional array of labels specifying index-string mapping." + + " If not provided or if empty, then metadata from inputCol is used instead.") setDefault(labels, Array.empty[String]) - /** - * Optional labels to be provided by the user, if not supplied column - * metadata is read for labels. 
- * @group getParam - */ + /** @group getParam */ final def getLabels: Array[String] = $(labels) - /** Transform the schema for the inverse transformation */ override def transformSchema(schema: StructType): StructType = { val inputColName = $(inputCol) val inputDataType = schema(inputColName).dataType diff --git a/mllib/src/main/scala/org/apache/spark/ml/util/MetadataUtils.scala b/mllib/src/main/scala/org/apache/spark/ml/util/MetadataUtils.scala index fcb517b5f735e..96a38a3bde960 100644 --- a/mllib/src/main/scala/org/apache/spark/ml/util/MetadataUtils.scala +++ b/mllib/src/main/scala/org/apache/spark/ml/util/MetadataUtils.scala @@ -25,7 +25,7 @@ import org.apache.spark.sql.types.StructField /** - * Helper utilities for tree-based algorithms + * Helper utilities for algorithms using ML metadata */ private[spark] object MetadataUtils { diff --git a/python/pyspark/ml/feature.py b/python/pyspark/ml/feature.py index 71dc636b83eac..97cbee73a05ed 100644 --- a/python/pyspark/ml/feature.py +++ b/python/pyspark/ml/feature.py @@ -985,17 +985,17 @@ class IndexToString(JavaTransformer, HasInputCol, HasOutputCol): """ .. note:: Experimental - A :py:class:`Transformer` that maps a column of string indices back to a new column of - corresponding string values using either the ML attributes of the input column, or if - provided using the labels supplied by the user. - All original columns are kept during transformation. + A :py:class:`Transformer` that maps a column of indices back to a new column of + corresponding string values. + The index-string mapping is either from the ML attributes of the input column, + or from user-supplied labels (which take precedence over ML attributes). See L{StringIndexer} for converting strings into indices. """ # a placeholder to make the labels show up in generated doc labels = Param(Params._dummy(), "labels", - "Optional array of labels to be provided by the user, if not supplied or " + - "empty, column metadata is read for labels") + "Optional array of labels specifying index-string mapping." + + " If not provided or if empty, then metadata from inputCol is used instead.") @keyword_only def __init__(self, inputCol=None, outputCol=None, labels=None): @@ -1006,8 +1006,8 @@ def __init__(self, inputCol=None, outputCol=None, labels=None): self._java_obj = self._new_java_obj("org.apache.spark.ml.feature.IndexToString", self.uid) self.labels = Param(self, "labels", - "Optional array of labels to be provided by the user, if not " + - "supplied or empty, column metadata is read for labels") + "Optional array of labels specifying index-string mapping. If not" + + " provided or if empty, then metadata from inputCol is used instead.") kwargs = self.__init__._input_kwargs self.setParams(**kwargs) From 6ce0886eb0916a985db142c0b6d2c2b14db5063d Mon Sep 17 00:00:00 2001 From: Yin Huai Date: Fri, 11 Sep 2015 09:42:53 -0700 Subject: [PATCH 255/802] [SPARK-10540] [SQL] Ignore HadoopFsRelationTest's "test all data types" if it is too flaky If hadoopFsRelationSuites's "test all data types" is too flaky we can disable it for now. https://issues.apache.org/jira/browse/SPARK-10540 Author: Yin Huai Closes #8705 from yhuai/SPARK-10540-ignore. 
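Relating back to the IndexToString change above, here is a minimal PySpark sketch of how the labels param interacts with column metadata. The toy data and label strings are made up, and sqlContext is assumed to exist; when labels is supplied, index i maps to labels[i], otherwise the mapping comes from the ML attributes written by StringIndexer.

{{{
from pyspark.ml.feature import StringIndexer, IndexToString

# Hypothetical toy data.
df = sqlContext.createDataFrame(
    [(0, "a"), (1, "b"), (2, "c"), (3, "a")], ["id", "category"])

indexed = StringIndexer(inputCol="category", outputCol="categoryIndex") \
    .fit(df).transform(df)

# No labels supplied: the index-string mapping is read from the ML attributes
# (column metadata) that StringIndexer attached to "categoryIndex".
fromMetadata = IndexToString(inputCol="categoryIndex", outputCol="original")

# Explicit labels: these take precedence over the metadata; labels[i] is used
# for index i, so the list must cover every index present in the column.
fromLabels = IndexToString(inputCol="categoryIndex", outputCol="original",
                           labels=["first", "second", "third"])

fromMetadata.transform(indexed).show()
fromLabels.transform(indexed).show()
}}}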
--- .../org/apache/spark/sql/sources/hadoopFsRelationSuites.scala | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sql/hive/src/test/scala/org/apache/spark/sql/sources/hadoopFsRelationSuites.scala b/sql/hive/src/test/scala/org/apache/spark/sql/sources/hadoopFsRelationSuites.scala index 24f43cf7c15ca..13223c61584b2 100644 --- a/sql/hive/src/test/scala/org/apache/spark/sql/sources/hadoopFsRelationSuites.scala +++ b/sql/hive/src/test/scala/org/apache/spark/sql/sources/hadoopFsRelationSuites.scala @@ -100,7 +100,7 @@ abstract class HadoopFsRelationTest extends QueryTest with SQLTestUtils with Tes } } - test("test all data types") { + ignore("test all data types") { withTempPath { file => // Create the schema. val struct = From 5f46444765a377696af76af6e2c77ab14bfdab8e Mon Sep 17 00:00:00 2001 From: Yuhao Yang Date: Fri, 11 Sep 2015 10:32:35 -0700 Subject: [PATCH 256/802] [SPARK-8530] [ML] add python API for MinMaxScaler jira: https://issues.apache.org/jira/browse/SPARK-8530 add python API for MinMaxScaler jira for MinMaxScaler: https://issues.apache.org/jira/browse/SPARK-7514 Author: Yuhao Yang Closes #7150 from hhbyyh/pythonMinMax. --- python/pyspark/ml/feature.py | 104 +++++++++++++++++++++++++++++++++-- 1 file changed, 99 insertions(+), 5 deletions(-) diff --git a/python/pyspark/ml/feature.py b/python/pyspark/ml/feature.py index 97cbee73a05ed..92db8df80280b 100644 --- a/python/pyspark/ml/feature.py +++ b/python/pyspark/ml/feature.py @@ -27,11 +27,11 @@ from pyspark.mllib.linalg import _convert_to_vector __all__ = ['Binarizer', 'Bucketizer', 'DCT', 'ElementwiseProduct', 'HashingTF', 'IDF', 'IDFModel', - 'IndexToString', 'NGram', 'Normalizer', 'OneHotEncoder', 'PCA', 'PCAModel', - 'PolynomialExpansion', 'RegexTokenizer', 'RFormula', 'RFormulaModel', 'SQLTransformer', - 'StandardScaler', 'StandardScalerModel', 'StopWordsRemover', 'StringIndexer', - 'StringIndexerModel', 'Tokenizer', 'VectorAssembler', 'VectorIndexer', 'VectorSlicer', - 'Word2Vec', 'Word2VecModel'] + 'IndexToString', 'MinMaxScaler', 'MinMaxScalerModel', 'NGram', 'Normalizer', + 'OneHotEncoder', 'PCA', 'PCAModel', 'PolynomialExpansion', 'RegexTokenizer', + 'RFormula', 'RFormulaModel', 'SQLTransformer', 'StandardScaler', 'StandardScalerModel', + 'StopWordsRemover', 'StringIndexer', 'StringIndexerModel', 'Tokenizer', + 'VectorAssembler', 'VectorIndexer', 'VectorSlicer', 'Word2Vec', 'Word2VecModel'] @inherit_doc @@ -406,6 +406,100 @@ class IDFModel(JavaModel): """ +@inherit_doc +class MinMaxScaler(JavaEstimator, HasInputCol, HasOutputCol): + """ + .. note:: Experimental + + Rescale each feature individually to a common range [min, max] linearly using column summary + statistics, which is also known as min-max normalization or Rescaling. The rescaled value for + feature E is calculated as, + + Rescaled(e_i) = (e_i - E_min) / (E_max - E_min) * (max - min) + min + + For the case E_max == E_min, Rescaled(e_i) = 0.5 * (max + min) + + Note that since zero values will probably be transformed to non-zero values, output of the + transformer will be DenseVector even for sparse input. + + >>> from pyspark.mllib.linalg import Vectors + >>> df = sqlContext.createDataFrame([(Vectors.dense([0.0]),), (Vectors.dense([2.0]),)], ["a"]) + >>> mmScaler = MinMaxScaler(inputCol="a", outputCol="scaled") + >>> model = mmScaler.fit(df) + >>> model.transform(df).show() + +-----+------+ + | a|scaled| + +-----+------+ + |[0.0]| [0.0]| + |[2.0]| [1.0]| + +-----+------+ + ... 
+ """ + + # a placeholder to make it appear in the generated doc + min = Param(Params._dummy(), "min", "Lower bound of the output feature range") + max = Param(Params._dummy(), "max", "Upper bound of the output feature range") + + @keyword_only + def __init__(self, min=0.0, max=1.0, inputCol=None, outputCol=None): + """ + __init__(self, min=0.0, max=1.0, inputCol=None, outputCol=None) + """ + super(MinMaxScaler, self).__init__() + self._java_obj = self._new_java_obj("org.apache.spark.ml.feature.MinMaxScaler", self.uid) + self.min = Param(self, "min", "Lower bound of the output feature range") + self.max = Param(self, "max", "Upper bound of the output feature range") + self._setDefault(min=0.0, max=1.0) + kwargs = self.__init__._input_kwargs + self.setParams(**kwargs) + + @keyword_only + def setParams(self, min=0.0, max=1.0, inputCol=None, outputCol=None): + """ + setParams(self, min=0.0, max=1.0, inputCol=None, outputCol=None) + Sets params for this MinMaxScaler. + """ + kwargs = self.setParams._input_kwargs + return self._set(**kwargs) + + def setMin(self, value): + """ + Sets the value of :py:attr:`min`. + """ + self._paramMap[self.min] = value + return self + + def getMin(self): + """ + Gets the value of min or its default value. + """ + return self.getOrDefault(self.min) + + def setMax(self, value): + """ + Sets the value of :py:attr:`max`. + """ + self._paramMap[self.max] = value + return self + + def getMax(self): + """ + Gets the value of max or its default value. + """ + return self.getOrDefault(self.max) + + def _create_model(self, java_model): + return MinMaxScalerModel(java_model) + + +class MinMaxScalerModel(JavaModel): + """ + .. note:: Experimental + + Model fitted by :py:class:`MinMaxScaler`. + """ + + @inherit_doc @ignore_unicode_prefix class NGram(JavaTransformer, HasInputCol, HasOutputCol): From b231ab8938ae3c4fc2089cfc69c0d8164807d533 Mon Sep 17 00:00:00 2001 From: tedyu Date: Fri, 11 Sep 2015 21:45:45 +0100 Subject: [PATCH 257/802] [SPARK-10546] Check partitionId's range in ExternalSorter#spill() See this thread for background: http://search-hadoop.com/m/q3RTt0rWvIkHAE81 We should check the range of partition Id and provide meaningful message through exception. Alternatively, we can use abs() and modulo to force the partition Id into legitimate range. However, expectation is that user should correct the logic error in his / her code. Author: tedyu Closes #8703 from tedyu/master. 
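A plain-Python sketch of the fail-fast pattern this patch adds (it is not Spark code; the message text simply mirrors the new require()):

{{{
def check_partition_id(partition_id, num_partitions):
    # Fail fast with the offending value instead of letting a bad id from a
    # user-supplied partitioner corrupt the spill files further down.
    if not (0 <= partition_id < num_partitions):
        raise ValueError("partition Id: %d should be in the range [0, %d)"
                         % (partition_id, num_partitions))
    return partition_id

check_partition_id(3, 4)    # ok
check_partition_id(-1, 4)   # raises ValueError with a meaningful message
}}}

As the commit message notes, forcing the id into range with abs() and modulo would hide the logic error, so failing fast with a clear message was preferred.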
--- .../scala/org/apache/spark/util/collection/ExternalSorter.scala | 2 ++ 1 file changed, 2 insertions(+) diff --git a/core/src/main/scala/org/apache/spark/util/collection/ExternalSorter.scala b/core/src/main/scala/org/apache/spark/util/collection/ExternalSorter.scala index 138c05dff19e4..31230d5978b2a 100644 --- a/core/src/main/scala/org/apache/spark/util/collection/ExternalSorter.scala +++ b/core/src/main/scala/org/apache/spark/util/collection/ExternalSorter.scala @@ -297,6 +297,8 @@ private[spark] class ExternalSorter[K, V, C]( val it = collection.destructiveSortedWritablePartitionedIterator(comparator) while (it.hasNext) { val partitionId = it.nextPartition() + require(partitionId >= 0 && partitionId < numPartitions, + s"partition Id: ${partitionId} should be in the range [0, ${numPartitions})") it.writeNext(writer) elementsPerPartition(partitionId) += 1 objectsWritten += 1 From c373866774c082885a50daaf7c83f3a14b0cd714 Mon Sep 17 00:00:00 2001 From: Icaro Medeiros Date: Fri, 11 Sep 2015 21:46:52 +0100 Subject: [PATCH 258/802] [PYTHON] Fixed typo in exception message Just fixing a typo in exception message, raised when attempting to pickle SparkContext. Author: Icaro Medeiros Closes #8724 from icaromedeiros/master. --- python/pyspark/context.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/python/pyspark/context.py b/python/pyspark/context.py index 1b2a52ad64114..a0a1ccbeefb09 100644 --- a/python/pyspark/context.py +++ b/python/pyspark/context.py @@ -255,7 +255,7 @@ def __getnewargs__(self): # This method is called when attempting to pickle SparkContext, which is always an error: raise Exception( "It appears that you are attempting to reference SparkContext from a broadcast " - "variable, action, or transforamtion. SparkContext can only be used on the driver, " + "variable, action, or transformation. SparkContext can only be used on the driver, " "not in code that it run on workers. For more information, see SPARK-5063." ) From d5d647380f93f4773f9cb85ea6544892d409b5a1 Mon Sep 17 00:00:00 2001 From: Wenchen Fan Date: Fri, 11 Sep 2015 14:15:16 -0700 Subject: [PATCH 259/802] [SPARK-10442] [SQL] fix string to boolean cast When we cast string to boolean in hive, it returns `true` if the length of string is > 0, and spark SQL follows this behavior. However, this behavior is very different from other SQL systems: 1. [presto](https://github.com/facebook/presto/blob/master/presto-main/src/main/java/com/facebook/presto/type/VarcharOperators.java#L89-L118) will return `true` for 't' 'true' '1', `false` for 'f' 'false' '0', throw exception for others. 2. [redshift](http://docs.aws.amazon.com/redshift/latest/dg/r_Boolean_type.html) will return `true` for 't' 'true' 'y' 'yes' '1', `false` for 'f' 'false' 'n' 'no' '0', null for others. 3. [postgresql](http://www.postgresql.org/docs/devel/static/datatype-boolean.html) will return `true` for 't' 'true' 'y' 'yes' 'on' '1', `false` for 'f' 'false' 'n' 'no' 'off' '0', throw exception for others. 4. [vertica](https://my.vertica.com/docs/5.0/HTML/Master/2983.htm) will return `true` for 't' 'true' 'y' 'yes' '1', `false` for 'f' 'false' 'n' 'no' '0', null for others. 5. [impala](http://www.cloudera.com/content/cloudera/en/documentation/cloudera-impala/latest/topics/impala_boolean.html) throw exception when try to cast string to boolean. 6. 
mysql, oracle, sqlserver don't have boolean type Whether we should change the cast behavior according to other SQL system or not is not decided yet, this PR is a test to see if we changed, how many compatibility tests will fail. Author: Wenchen Fan Closes #8698 from cloud-fan/string2boolean. --- .../spark/sql/catalyst/expressions/Cast.scala | 24 +++++++- .../spark/sql/catalyst/util/StringUtils.scala | 8 +++ .../sql/catalyst/expressions/CastSuite.scala | 61 ++++++++++++------- .../sql/sources/hadoopFsRelationSuites.scala | 13 ++++ 4 files changed, 82 insertions(+), 24 deletions(-) diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/Cast.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/Cast.scala index 2db954257be35..f0bce388d959a 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/Cast.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/Cast.scala @@ -22,7 +22,7 @@ import java.math.{BigDecimal => JavaBigDecimal} import org.apache.spark.sql.catalyst.InternalRow import org.apache.spark.sql.catalyst.analysis.TypeCheckResult import org.apache.spark.sql.catalyst.expressions.codegen._ -import org.apache.spark.sql.catalyst.util.DateTimeUtils +import org.apache.spark.sql.catalyst.util.{StringUtils, DateTimeUtils} import org.apache.spark.sql.types._ import org.apache.spark.unsafe.types.{CalendarInterval, UTF8String} @@ -140,7 +140,15 @@ case class Cast(child: Expression, dataType: DataType) // UDFToBoolean private[this] def castToBoolean(from: DataType): Any => Any = from match { case StringType => - buildCast[UTF8String](_, _.numBytes() != 0) + buildCast[UTF8String](_, s => { + if (StringUtils.isTrueString(s)) { + true + } else if (StringUtils.isFalseString(s)) { + false + } else { + null + } + }) case TimestampType => buildCast[Long](_, t => t != 0) case DateType => @@ -646,7 +654,17 @@ case class Cast(child: Expression, dataType: DataType) private[this] def castToBooleanCode(from: DataType): CastFunction = from match { case StringType => - (c, evPrim, evNull) => s"$evPrim = $c.numBytes() != 0;" + val stringUtils = StringUtils.getClass.getName.stripSuffix("$") + (c, evPrim, evNull) => + s""" + if ($stringUtils.isTrueString($c)) { + $evPrim = true; + } else if ($stringUtils.isFalseString($c)) { + $evPrim = false; + } else { + $evNull = true; + } + """ case TimestampType => (c, evPrim, evNull) => s"$evPrim = $c != 0;" case DateType => diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/util/StringUtils.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/util/StringUtils.scala index 9ddfb3a0d3759..c2eeb3c5650ab 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/util/StringUtils.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/util/StringUtils.scala @@ -19,6 +19,8 @@ package org.apache.spark.sql.catalyst.util import java.util.regex.Pattern +import org.apache.spark.unsafe.types.UTF8String + object StringUtils { // replace the _ with .{1} exactly match 1 time of any character @@ -44,4 +46,10 @@ object StringUtils { v } } + + private[this] val trueStrings = Set("t", "true", "y", "yes", "1").map(UTF8String.fromString) + private[this] val falseStrings = Set("f", "false", "n", "no", "0").map(UTF8String.fromString) + + def isTrueString(s: UTF8String): Boolean = trueStrings.contains(s.toLowerCase) + def isFalseString(s: UTF8String): Boolean = falseStrings.contains(s.toLowerCase) } diff --git 
a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/CastSuite.scala b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/CastSuite.scala index 1ad70733eae03..f4db4da7646f8 100644 --- a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/CastSuite.scala +++ b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/CastSuite.scala @@ -503,9 +503,9 @@ class CastSuite extends SparkFunSuite with ExpressionEvalHelper { } test("cast from array") { - val array = Literal.create(Seq("123", "abc", "", null), + val array = Literal.create(Seq("123", "true", "f", null), ArrayType(StringType, containsNull = true)) - val array_notNull = Literal.create(Seq("123", "abc", ""), + val array_notNull = Literal.create(Seq("123", "true", "f"), ArrayType(StringType, containsNull = false)) checkNullCast(ArrayType(StringType), ArrayType(IntegerType)) @@ -522,7 +522,7 @@ class CastSuite extends SparkFunSuite with ExpressionEvalHelper { { val ret = cast(array, ArrayType(BooleanType, containsNull = true)) assert(ret.resolved === true) - checkEvaluation(ret, Seq(true, true, false, null)) + checkEvaluation(ret, Seq(null, true, false, null)) } { val ret = cast(array, ArrayType(BooleanType, containsNull = false)) @@ -541,12 +541,12 @@ class CastSuite extends SparkFunSuite with ExpressionEvalHelper { { val ret = cast(array_notNull, ArrayType(BooleanType, containsNull = true)) assert(ret.resolved === true) - checkEvaluation(ret, Seq(true, true, false)) + checkEvaluation(ret, Seq(null, true, false)) } { val ret = cast(array_notNull, ArrayType(BooleanType, containsNull = false)) assert(ret.resolved === true) - checkEvaluation(ret, Seq(true, true, false)) + checkEvaluation(ret, Seq(null, true, false)) } { @@ -557,10 +557,10 @@ class CastSuite extends SparkFunSuite with ExpressionEvalHelper { test("cast from map") { val map = Literal.create( - Map("a" -> "123", "b" -> "abc", "c" -> "", "d" -> null), + Map("a" -> "123", "b" -> "true", "c" -> "f", "d" -> null), MapType(StringType, StringType, valueContainsNull = true)) val map_notNull = Literal.create( - Map("a" -> "123", "b" -> "abc", "c" -> ""), + Map("a" -> "123", "b" -> "true", "c" -> "f"), MapType(StringType, StringType, valueContainsNull = false)) checkNullCast(MapType(StringType, IntegerType), MapType(StringType, StringType)) @@ -577,7 +577,7 @@ class CastSuite extends SparkFunSuite with ExpressionEvalHelper { { val ret = cast(map, MapType(StringType, BooleanType, valueContainsNull = true)) assert(ret.resolved === true) - checkEvaluation(ret, Map("a" -> true, "b" -> true, "c" -> false, "d" -> null)) + checkEvaluation(ret, Map("a" -> null, "b" -> true, "c" -> false, "d" -> null)) } { val ret = cast(map, MapType(StringType, BooleanType, valueContainsNull = false)) @@ -600,12 +600,12 @@ class CastSuite extends SparkFunSuite with ExpressionEvalHelper { { val ret = cast(map_notNull, MapType(StringType, BooleanType, valueContainsNull = true)) assert(ret.resolved === true) - checkEvaluation(ret, Map("a" -> true, "b" -> true, "c" -> false)) + checkEvaluation(ret, Map("a" -> null, "b" -> true, "c" -> false)) } { val ret = cast(map_notNull, MapType(StringType, BooleanType, valueContainsNull = false)) assert(ret.resolved === true) - checkEvaluation(ret, Map("a" -> true, "b" -> true, "c" -> false)) + checkEvaluation(ret, Map("a" -> null, "b" -> true, "c" -> false)) } { val ret = cast(map_notNull, MapType(IntegerType, StringType, valueContainsNull = true)) @@ -630,8 +630,8 @@ class CastSuite extends SparkFunSuite with 
ExpressionEvalHelper { val struct = Literal.create( InternalRow( UTF8String.fromString("123"), - UTF8String.fromString("abc"), - UTF8String.fromString(""), + UTF8String.fromString("true"), + UTF8String.fromString("f"), null), StructType(Seq( StructField("a", StringType, nullable = true), @@ -641,8 +641,8 @@ class CastSuite extends SparkFunSuite with ExpressionEvalHelper { val struct_notNull = Literal.create( InternalRow( UTF8String.fromString("123"), - UTF8String.fromString("abc"), - UTF8String.fromString("")), + UTF8String.fromString("true"), + UTF8String.fromString("f")), StructType(Seq( StructField("a", StringType, nullable = false), StructField("b", StringType, nullable = false), @@ -672,7 +672,7 @@ class CastSuite extends SparkFunSuite with ExpressionEvalHelper { StructField("c", BooleanType, nullable = true), StructField("d", BooleanType, nullable = true)))) assert(ret.resolved === true) - checkEvaluation(ret, InternalRow(true, true, false, null)) + checkEvaluation(ret, InternalRow(null, true, false, null)) } { val ret = cast(struct, StructType(Seq( @@ -704,7 +704,7 @@ class CastSuite extends SparkFunSuite with ExpressionEvalHelper { StructField("b", BooleanType, nullable = true), StructField("c", BooleanType, nullable = true)))) assert(ret.resolved === true) - checkEvaluation(ret, InternalRow(true, true, false)) + checkEvaluation(ret, InternalRow(null, true, false)) } { val ret = cast(struct_notNull, StructType(Seq( @@ -712,7 +712,7 @@ class CastSuite extends SparkFunSuite with ExpressionEvalHelper { StructField("b", BooleanType, nullable = true), StructField("c", BooleanType, nullable = false)))) assert(ret.resolved === true) - checkEvaluation(ret, InternalRow(true, true, false)) + checkEvaluation(ret, InternalRow(null, true, false)) } { @@ -731,8 +731,8 @@ class CastSuite extends SparkFunSuite with ExpressionEvalHelper { test("complex casting") { val complex = Literal.create( Row( - Seq("123", "abc", ""), - Map("a" ->"123", "b" -> "abc", "c" -> ""), + Seq("123", "true", "f"), + Map("a" ->"123", "b" -> "true", "c" -> "f"), Row(0)), StructType(Seq( StructField("a", @@ -755,11 +755,11 @@ class CastSuite extends SparkFunSuite with ExpressionEvalHelper { assert(ret.resolved === true) checkEvaluation(ret, Row( Seq(123, null, null), - Map("a" -> true, "b" -> true, "c" -> false), + Map("a" -> null, "b" -> true, "c" -> false), Row(0L))) } - test("case between string and interval") { + test("cast between string and interval") { import org.apache.spark.unsafe.types.CalendarInterval checkEvaluation(Cast(Literal("interval -3 month 7 hours"), CalendarIntervalType), @@ -769,4 +769,23 @@ class CastSuite extends SparkFunSuite with ExpressionEvalHelper { StringType), "interval 1 years 3 months -3 days") } + + test("cast string to boolean") { + checkCast("t", true) + checkCast("true", true) + checkCast("tRUe", true) + checkCast("y", true) + checkCast("yes", true) + checkCast("1", true) + + checkCast("f", false) + checkCast("false", false) + checkCast("FAlsE", false) + checkCast("n", false) + checkCast("no", false) + checkCast("0", false) + + checkEvaluation(cast("abc", BooleanType), null) + checkEvaluation(cast("", BooleanType), null) + } } diff --git a/sql/hive/src/test/scala/org/apache/spark/sql/sources/hadoopFsRelationSuites.scala b/sql/hive/src/test/scala/org/apache/spark/sql/sources/hadoopFsRelationSuites.scala index 13223c61584b2..8ffcef85668d6 100644 --- a/sql/hive/src/test/scala/org/apache/spark/sql/sources/hadoopFsRelationSuites.scala +++ 
b/sql/hive/src/test/scala/org/apache/spark/sql/sources/hadoopFsRelationSuites.scala @@ -375,6 +375,19 @@ abstract class HadoopFsRelationTest extends QueryTest with SQLTestUtils with Tes } } + test("saveAsTable()/load() - partitioned table - boolean type") { + sqlContext.range(2) + .select('id, ('id % 2 === 0).as("b")) + .write.partitionBy("b").saveAsTable("t") + + withTable("t") { + checkAnswer( + sqlContext.table("t").sort('id), + Row(0, true) :: Row(1, false) :: Nil + ) + } + } + test("saveAsTable()/load() - partitioned table - Overwrite") { partitionedTestDF.write .format(dataSourceName) From 1eede3b254ee3793841c92971707094ac8afee35 Mon Sep 17 00:00:00 2001 From: Yash Datta Date: Fri, 11 Sep 2015 14:55:15 -0700 Subject: [PATCH 260/802] [SPARK-7142] [SQL] Minor enhancement to BooleanSimplification Optimizer rule. Incorporate review comments Adding changes suggested by cloud-fan in #5700 cc marmbrus Author: Yash Datta Closes #8716 from saucam/bool_simp. --- .../apache/spark/sql/catalyst/optimizer/Optimizer.scala | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/optimizer/Optimizer.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/optimizer/Optimizer.scala index d9b50f3c97da0..0f4caec7451a2 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/optimizer/Optimizer.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/optimizer/Optimizer.scala @@ -435,10 +435,10 @@ object BooleanSimplification extends Rule[LogicalPlan] with PredicateHelper { // a && a => a case (l, r) if l fastEquals r => l // a && (not(a) || b) => a && b - case (l, Or(l1, r)) if (Not(l) fastEquals l1) => And(l, r) - case (l, Or(r, l1)) if (Not(l) fastEquals l1) => And(l, r) - case (Or(l, l1), r) if (l1 fastEquals Not(r)) => And(l, r) - case (Or(l1, l), r) if (l1 fastEquals Not(r)) => And(l, r) + case (l, Or(l1, r)) if (Not(l) == l1) => And(l, r) + case (l, Or(r, l1)) if (Not(l) == l1) => And(l, r) + case (Or(l, l1), r) if (l1 == Not(r)) => And(l, r) + case (Or(l1, l), r) if (l1 == Not(r)) => And(l, r) // (a || b) && (a || c) => a || (b && c) case _ => // 1. Split left and right to get the disjunctive predicates, From e626ac5f5c27dcc74113070f2fec03682bcd12bd Mon Sep 17 00:00:00 2001 From: zsxwing Date: Fri, 11 Sep 2015 15:00:13 -0700 Subject: [PATCH 261/802] [SPARK-9992] [SPARK-9994] [SPARK-9998] [SQL] Implement the local TopK, sample and intersect operators This PR is in conflict with #8535. I will update this one when #8535 gets merged. Author: zsxwing Closes #8573 from zsxwing/more-local-operators. 
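All three new operators follow the same pull-based open()/next()/fetch()/close() protocol. Below is a minimal Python sketch of that protocol for the intersect case, mirroring the Scala IntersectNode in the diff that follows; the children are assumed to expose the same four methods and to produce hashable rows, and this is an illustration rather than Spark's API.

{{{
class IntersectNode(object):
    def __init__(self, left, right):
        self.left, self.right = left, right
        self.left_rows = None
        self.current = None

    def open(self):
        # Drain the left child eagerly into a set, then open the right child.
        # (The Scala version copies each row because InternalRows are reused.)
        self.left.open()
        self.left_rows = set()
        while self.left.next():
            self.left_rows.add(self.left.fetch())
        self.left.close()
        self.right.open()

    def next(self):
        # Advance the right child until a row that also appears on the left is found.
        self.current = None
        while self.current is None and self.right.next():
            row = self.right.fetch()
            if row in self.left_rows:
                self.current = row
        return self.current is not None

    def fetch(self):
        return self.current

    def close(self):
        self.left.close()
        self.right.close()
}}}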
--- .../spark/sql/execution/basicOperators.scala | 2 +- .../sql/execution/local/IntersectNode.scala | 63 ++++++++++++++ .../spark/sql/execution/local/LocalNode.scala | 5 ++ .../sql/execution/local/SampleNode.scala | 82 +++++++++++++++++++ .../local/TakeOrderedAndProjectNode.scala | 73 +++++++++++++++++ .../execution/local/IntersectNodeSuite.scala | 35 ++++++++ .../sql/execution/local/SampleNodeSuite.scala | 40 +++++++++ .../TakeOrderedAndProjectNodeSuite.scala | 54 ++++++++++++ 8 files changed, 353 insertions(+), 1 deletion(-) create mode 100644 sql/core/src/main/scala/org/apache/spark/sql/execution/local/IntersectNode.scala create mode 100644 sql/core/src/main/scala/org/apache/spark/sql/execution/local/SampleNode.scala create mode 100644 sql/core/src/main/scala/org/apache/spark/sql/execution/local/TakeOrderedAndProjectNode.scala create mode 100644 sql/core/src/test/scala/org/apache/spark/sql/execution/local/IntersectNodeSuite.scala create mode 100644 sql/core/src/test/scala/org/apache/spark/sql/execution/local/SampleNodeSuite.scala create mode 100644 sql/core/src/test/scala/org/apache/spark/sql/execution/local/TakeOrderedAndProjectNodeSuite.scala diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/basicOperators.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/basicOperators.scala index 3f68b05a24f44..bf6d44c098ee3 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/basicOperators.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/basicOperators.scala @@ -138,7 +138,7 @@ case class Filter(condition: Expression, child: SparkPlan) extends UnaryNode { * will be ub - lb. * @param withReplacement Whether to sample with replacement. * @param seed the random seed - * @param child the QueryPlan + * @param child the SparkPlan */ @DeveloperApi case class Sample( diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/local/IntersectNode.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/local/IntersectNode.scala new file mode 100644 index 0000000000000..740d485f8d9e6 --- /dev/null +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/local/IntersectNode.scala @@ -0,0 +1,63 @@ +/* +* Licensed to the Apache Software Foundation (ASF) under one or more +* contributor license agreements. See the NOTICE file distributed with +* this work for additional information regarding copyright ownership. +* The ASF licenses this file to You under the Apache License, Version 2.0 +* (the "License"); you may not use this file except in compliance with +* the License. You may obtain a copy of the License at +* +* http://www.apache.org/licenses/LICENSE-2.0 +* +* Unless required by applicable law or agreed to in writing, software +* distributed under the License is distributed on an "AS IS" BASIS, +* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +* See the License for the specific language governing permissions and +* limitations under the License. 
+*/ + +package org.apache.spark.sql.execution.local + +import scala.collection.mutable + +import org.apache.spark.sql.SQLConf +import org.apache.spark.sql.catalyst.InternalRow +import org.apache.spark.sql.catalyst.expressions.Attribute + +case class IntersectNode(conf: SQLConf, left: LocalNode, right: LocalNode) + extends BinaryLocalNode(conf) { + + override def output: Seq[Attribute] = left.output + + private[this] var leftRows: mutable.HashSet[InternalRow] = _ + + private[this] var currentRow: InternalRow = _ + + override def open(): Unit = { + left.open() + leftRows = mutable.HashSet[InternalRow]() + while (left.next()) { + leftRows += left.fetch().copy() + } + left.close() + right.open() + } + + override def next(): Boolean = { + currentRow = null + while (currentRow == null && right.next()) { + currentRow = right.fetch() + if (!leftRows.contains(currentRow)) { + currentRow = null + } + } + currentRow != null + } + + override def fetch(): InternalRow = currentRow + + override def close(): Unit = { + left.close() + right.close() + } + +} diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/local/LocalNode.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/local/LocalNode.scala index c4f8ae304db39..a2c275db9b35d 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/local/LocalNode.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/local/LocalNode.scala @@ -69,6 +69,11 @@ abstract class LocalNode(conf: SQLConf) extends TreeNode[LocalNode] with Logging */ def close(): Unit + /** + * Returns the content through the [[Iterator]] interface. + */ + final def asIterator: Iterator[InternalRow] = new LocalNodeIterator(this) + /** * Returns the content of the iterator from the beginning to the end in the form of a Scala Seq. */ diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/local/SampleNode.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/local/SampleNode.scala new file mode 100644 index 0000000000000..abf3df1c0c2af --- /dev/null +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/local/SampleNode.scala @@ -0,0 +1,82 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.spark.sql.execution.local + +import java.util.Random + +import org.apache.spark.sql.SQLConf +import org.apache.spark.sql.catalyst.InternalRow +import org.apache.spark.sql.catalyst.expressions.Attribute +import org.apache.spark.util.random.{BernoulliCellSampler, PoissonSampler} + +/** + * Sample the dataset. + * + * @param conf the SQLConf + * @param lowerBound Lower-bound of the sampling probability (usually 0.0) + * @param upperBound Upper-bound of the sampling probability. The expected fraction sampled + * will be ub - lb. 
+ * @param withReplacement Whether to sample with replacement. + * @param seed the random seed + * @param child the LocalNode + */ +case class SampleNode( + conf: SQLConf, + lowerBound: Double, + upperBound: Double, + withReplacement: Boolean, + seed: Long, + child: LocalNode) extends UnaryLocalNode(conf) { + + override def output: Seq[Attribute] = child.output + + private[this] var iterator: Iterator[InternalRow] = _ + + private[this] var currentRow: InternalRow = _ + + override def open(): Unit = { + child.open() + val (sampler, _seed) = if (withReplacement) { + val random = new Random(seed) + // Disable gap sampling since the gap sampling method buffers two rows internally, + // requiring us to copy the row, which is more expensive than the random number generator. + (new PoissonSampler[InternalRow](upperBound - lowerBound, useGapSamplingIfPossible = false), + // Use the seed for partition 0 like PartitionwiseSampledRDD to generate the same result + // of DataFrame + random.nextLong()) + } else { + (new BernoulliCellSampler[InternalRow](lowerBound, upperBound), seed) + } + sampler.setSeed(_seed) + iterator = sampler.sample(child.asIterator) + } + + override def next(): Boolean = { + if (iterator.hasNext) { + currentRow = iterator.next() + true + } else { + false + } + } + + override def fetch(): InternalRow = currentRow + + override def close(): Unit = child.close() + +} diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/local/TakeOrderedAndProjectNode.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/local/TakeOrderedAndProjectNode.scala new file mode 100644 index 0000000000000..53f1dcc65d8cf --- /dev/null +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/local/TakeOrderedAndProjectNode.scala @@ -0,0 +1,73 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.spark.sql.execution.local + +import org.apache.spark.sql.SQLConf +import org.apache.spark.sql.catalyst.InternalRow +import org.apache.spark.sql.catalyst.expressions._ +import org.apache.spark.util.BoundedPriorityQueue + +case class TakeOrderedAndProjectNode( + conf: SQLConf, + limit: Int, + sortOrder: Seq[SortOrder], + projectList: Option[Seq[NamedExpression]], + child: LocalNode) extends UnaryLocalNode(conf) { + + private[this] var projection: Option[Projection] = _ + private[this] var ord: InterpretedOrdering = _ + private[this] var iterator: Iterator[InternalRow] = _ + private[this] var currentRow: InternalRow = _ + + override def output: Seq[Attribute] = { + val projectOutput = projectList.map(_.map(_.toAttribute)) + projectOutput.getOrElse(child.output) + } + + override def open(): Unit = { + child.open() + projection = projectList.map(new InterpretedProjection(_, child.output)) + ord = new InterpretedOrdering(sortOrder, child.output) + // Priority keeps the largest elements, so let's reverse the ordering. + val queue = new BoundedPriorityQueue[InternalRow](limit)(ord.reverse) + while (child.next()) { + queue += child.fetch() + } + // Close it eagerly since we don't need it. + child.close() + iterator = queue.iterator + } + + override def next(): Boolean = { + if (iterator.hasNext) { + val _currentRow = iterator.next() + currentRow = projection match { + case Some(p) => p(_currentRow) + case None => _currentRow + } + true + } else { + false + } + } + + override def fetch(): InternalRow = currentRow + + override def close(): Unit = child.close() + +} diff --git a/sql/core/src/test/scala/org/apache/spark/sql/execution/local/IntersectNodeSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/execution/local/IntersectNodeSuite.scala new file mode 100644 index 0000000000000..7deaa375fcfc2 --- /dev/null +++ b/sql/core/src/test/scala/org/apache/spark/sql/execution/local/IntersectNodeSuite.scala @@ -0,0 +1,35 @@ +/* +* Licensed to the Apache Software Foundation (ASF) under one or more +* contributor license agreements. See the NOTICE file distributed with +* this work for additional information regarding copyright ownership. +* The ASF licenses this file to You under the Apache License, Version 2.0 +* (the "License"); you may not use this file except in compliance with +* the License. You may obtain a copy of the License at +* +* http://www.apache.org/licenses/LICENSE-2.0 +* +* Unless required by applicable law or agreed to in writing, software +* distributed under the License is distributed on an "AS IS" BASIS, +* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +* See the License for the specific language governing permissions and +* limitations under the License. 
+*/ + +package org.apache.spark.sql.execution.local + +class IntersectNodeSuite extends LocalNodeTest { + + import testImplicits._ + + test("basic") { + val input1 = (1 to 10).map(i => (i, i.toString)).toDF("key", "value") + val input2 = (1 to 10).filter(_ % 2 == 0).map(i => (i, i.toString)).toDF("key", "value") + + checkAnswer2( + input1, + input2, + (node1, node2) => IntersectNode(conf, node1, node2), + input1.intersect(input2).collect() + ) + } +} diff --git a/sql/core/src/test/scala/org/apache/spark/sql/execution/local/SampleNodeSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/execution/local/SampleNodeSuite.scala new file mode 100644 index 0000000000000..87a7da453999c --- /dev/null +++ b/sql/core/src/test/scala/org/apache/spark/sql/execution/local/SampleNodeSuite.scala @@ -0,0 +1,40 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.spark.sql.execution.local + +class SampleNodeSuite extends LocalNodeTest { + + import testImplicits._ + + private def testSample(withReplacement: Boolean): Unit = { + test(s"withReplacement: $withReplacement") { + val seed = 0L + val input = sqlContext.sparkContext. + parallelize((1 to 10).map(i => (i, i.toString)), 1). // Should be only 1 partition + toDF("key", "value") + checkAnswer( + input, + node => SampleNode(conf, 0.0, 0.3, withReplacement, seed, node), + input.sample(withReplacement, 0.3, seed).collect() + ) + } + } + + testSample(withReplacement = true) + testSample(withReplacement = false) +} diff --git a/sql/core/src/test/scala/org/apache/spark/sql/execution/local/TakeOrderedAndProjectNodeSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/execution/local/TakeOrderedAndProjectNodeSuite.scala new file mode 100644 index 0000000000000..ff28b24eeff14 --- /dev/null +++ b/sql/core/src/test/scala/org/apache/spark/sql/execution/local/TakeOrderedAndProjectNodeSuite.scala @@ -0,0 +1,54 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.spark.sql.execution.local + +import org.apache.spark.sql.Column +import org.apache.spark.sql.catalyst.expressions.{Ascending, Expression, SortOrder} + +class TakeOrderedAndProjectNodeSuite extends LocalNodeTest { + + import testImplicits._ + + private def columnToSortOrder(sortExprs: Column*): Seq[SortOrder] = { + val sortOrder: Seq[SortOrder] = sortExprs.map { col => + col.expr match { + case expr: SortOrder => + expr + case expr: Expression => + SortOrder(expr, Ascending) + } + } + sortOrder + } + + private def testTakeOrderedAndProjectNode(desc: Boolean): Unit = { + val testCaseName = if (desc) "desc" else "asc" + test(testCaseName) { + val input = (1 to 10).map(i => (i, i.toString)).toDF("key", "value") + val sortColumn = if (desc) input.col("key").desc else input.col("key") + checkAnswer( + input, + node => TakeOrderedAndProjectNode(conf, 5, columnToSortOrder(sortColumn), None, node), + input.sort(sortColumn).limit(5).collect() + ) + } + } + + testTakeOrderedAndProjectNode(desc = false) + testTakeOrderedAndProjectNode(desc = true) +} From c2af42b5f32287ff595ad027a8191d4b75702d8d Mon Sep 17 00:00:00 2001 From: Andrew Or Date: Fri, 11 Sep 2015 15:01:37 -0700 Subject: [PATCH 262/802] [SPARK-9990] [SQL] Local hash join follow-ups 1. Hide `LocalNodeIterator` behind the `LocalNode#asIterator` method 2. Add tests for this Author: Andrew Or Closes #8708 from andrewor14/local-hash-join-follow-up. --- .../sql/execution/joins/HashedRelation.scala | 7 +- .../sql/execution/local/HashJoinNode.scala | 3 +- .../spark/sql/execution/local/LocalNode.scala | 4 +- .../sql/execution/local/LocalNodeSuite.scala | 116 ++++++++++++++++++ 4 files changed, 125 insertions(+), 5 deletions(-) create mode 100644 sql/core/src/test/scala/org/apache/spark/sql/execution/local/LocalNodeSuite.scala diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/joins/HashedRelation.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/joins/HashedRelation.scala index 0cff21ca618b4..bc255b27502b2 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/joins/HashedRelation.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/joins/HashedRelation.scala @@ -25,7 +25,8 @@ import org.apache.spark.shuffle.ShuffleMemoryManager import org.apache.spark.sql.catalyst.InternalRow import org.apache.spark.sql.catalyst.expressions._ import org.apache.spark.sql.execution.SparkSqlSerializer -import org.apache.spark.sql.execution.metric.LongSQLMetric +import org.apache.spark.sql.execution.local.LocalNode +import org.apache.spark.sql.execution.metric.{LongSQLMetric, SQLMetrics} import org.apache.spark.unsafe.Platform import org.apache.spark.unsafe.map.BytesToBytesMap import org.apache.spark.unsafe.memory.{MemoryLocation, ExecutorMemoryManager, MemoryAllocator, TaskMemoryManager} @@ -113,6 +114,10 @@ final class UniqueKeyHashedRelation(private var hashTable: JavaHashMap[InternalR private[execution] object HashedRelation { + def apply(localNode: LocalNode, keyGenerator: Projection): HashedRelation = { + apply(localNode.asIterator, SQLMetrics.nullLongMetric, keyGenerator) + } + def apply( input: Iterator[InternalRow], numInputRows: LongSQLMetric, diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/local/HashJoinNode.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/local/HashJoinNode.scala index a3e68d6a7c341..e7b24e3fca2b4 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/local/HashJoinNode.scala +++ 
b/sql/core/src/main/scala/org/apache/spark/sql/execution/local/HashJoinNode.scala @@ -75,8 +75,7 @@ case class HashJoinNode( override def open(): Unit = { buildNode.open() - hashed = HashedRelation.apply( - new LocalNodeIterator(buildNode), SQLMetrics.nullLongMetric, buildSideKeyGenerator) + hashed = HashedRelation(buildNode, buildSideKeyGenerator) streamedNode.open() joinRow = new JoinedRow resultProjection = { diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/local/LocalNode.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/local/LocalNode.scala index a2c275db9b35d..e540ef8555eb6 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/local/LocalNode.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/local/LocalNode.scala @@ -77,7 +77,7 @@ abstract class LocalNode(conf: SQLConf) extends TreeNode[LocalNode] with Logging /** * Returns the content of the iterator from the beginning to the end in the form of a Scala Seq. */ - def collect(): Seq[Row] = { + final def collect(): Seq[Row] = { val converter = CatalystTypeConverters.createToScalaConverter(StructType.fromAttributes(output)) val result = new scala.collection.mutable.ArrayBuffer[Row] open() @@ -140,7 +140,7 @@ abstract class BinaryLocalNode(conf: SQLConf) extends LocalNode(conf) { /** * An thin wrapper around a [[LocalNode]] that provides an `Iterator` interface. */ -private[local] class LocalNodeIterator(localNode: LocalNode) extends Iterator[InternalRow] { +private class LocalNodeIterator(localNode: LocalNode) extends Iterator[InternalRow] { private var nextRow: InternalRow = _ override def hasNext: Boolean = { diff --git a/sql/core/src/test/scala/org/apache/spark/sql/execution/local/LocalNodeSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/execution/local/LocalNodeSuite.scala new file mode 100644 index 0000000000000..b89fa46f8b3b4 --- /dev/null +++ b/sql/core/src/test/scala/org/apache/spark/sql/execution/local/LocalNodeSuite.scala @@ -0,0 +1,116 @@ +/* +* Licensed to the Apache Software Foundation (ASF) under one or more +* contributor license agreements. See the NOTICE file distributed with +* this work for additional information regarding copyright ownership. +* The ASF licenses this file to You under the Apache License, Version 2.0 +* (the "License"); you may not use this file except in compliance with +* the License. You may obtain a copy of the License at +* +* http://www.apache.org/licenses/LICENSE-2.0 +* +* Unless required by applicable law or agreed to in writing, software +* distributed under the License is distributed on an "AS IS" BASIS, +* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +* See the License for the specific language governing permissions and +* limitations under the License. 
+*/ + +package org.apache.spark.sql.execution.local + +import org.apache.spark.SparkFunSuite +import org.apache.spark.sql.SQLConf +import org.apache.spark.sql.catalyst.InternalRow +import org.apache.spark.sql.catalyst.expressions._ +import org.apache.spark.sql.types.IntegerType + +class LocalNodeSuite extends SparkFunSuite { + private val data = (1 to 100).toArray + + test("basic open, next, fetch, close") { + val node = new DummyLocalNode(data) + assert(!node.isOpen) + node.open() + assert(node.isOpen) + data.foreach { i => + assert(node.next()) + // fetch should be idempotent + val fetched = node.fetch() + assert(node.fetch() === fetched) + assert(node.fetch() === fetched) + assert(node.fetch().numFields === 1) + assert(node.fetch().getInt(0) === i) + } + assert(!node.next()) + node.close() + assert(!node.isOpen) + } + + test("asIterator") { + val node = new DummyLocalNode(data) + val iter = node.asIterator + node.open() + data.foreach { i => + // hasNext should be idempotent + assert(iter.hasNext) + assert(iter.hasNext) + val item = iter.next() + assert(item.numFields === 1) + assert(item.getInt(0) === i) + } + intercept[NoSuchElementException] { + iter.next() + } + node.close() + } + + test("collect") { + val node = new DummyLocalNode(data) + node.open() + val collected = node.collect() + assert(collected.size === data.size) + assert(collected.forall(_.size === 1)) + assert(collected.map(_.getInt(0)) === data) + node.close() + } + +} + +/** + * A dummy [[LocalNode]] that just returns one row per integer in the input. + */ +private case class DummyLocalNode(conf: SQLConf, input: Array[Int]) extends LocalNode(conf) { + private var index = Int.MinValue + + def this(input: Array[Int]) { + this(new SQLConf, input) + } + + def isOpen: Boolean = { + index != Int.MinValue + } + + override def output: Seq[Attribute] = { + Seq(AttributeReference("something", IntegerType)()) + } + + override def children: Seq[LocalNode] = Seq.empty + + override def open(): Unit = { + index = -1 + } + + override def next(): Boolean = { + index += 1 + index < input.size + } + + override def fetch(): InternalRow = { + assert(index >= 0 && index < input.size) + val values = Array(input(index).asInstanceOf[Any]) + new GenericInternalRow(values) + } + + override def close(): Unit = { + index = Int.MinValue + } +} From d74c6a143cbd060c25bf14a8d306841b3ec55d03 Mon Sep 17 00:00:00 2001 From: Andrew Or Date: Fri, 11 Sep 2015 15:02:59 -0700 Subject: [PATCH 263/802] [SPARK-10564] ThreadingSuite: assertion failures in threads don't fail the test This commit ensures if an assertion fails within a thread, it will ultimately fail the test. Otherwise we end up potentially masking real bugs by not propagating assertion failures properly. Author: Andrew Or Closes #8723 from andrewor14/fix-threading-suite. 
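To make the fix concrete before the diff, here is a minimal, self-contained sketch of the pattern the patch applies (written in Java with illustrative names; the actual patch below changes Scala test code): a failure raised on a worker thread is recorded and rethrown on the main thread, so it cannot be silently swallowed.

```
import java.util.concurrent.Semaphore;
import java.util.concurrent.atomic.AtomicReference;

public final class PropagateWorkerFailure {
  public static void main(String[] args) throws Throwable {
    final AtomicReference<Throwable> failure = new AtomicReference<>();
    final Semaphore sem = new Semaphore(0);
    Thread worker = new Thread() {
      @Override
      public void run() {
        try {
          // Stand-in for a test assertion that may fail on this thread.
          if (1 + 1 != 2) {
            throw new AssertionError("unexpected result");
          }
        } catch (Throwable t) {
          failure.set(t);   // remember the failure instead of losing it
        } finally {
          sem.release();    // always unblock the waiting thread
        }
      }
    };
    worker.start();
    sem.acquire();
    Throwable t = failure.get();
    if (t != null) {
      throw t;              // rethrow on the main thread so the test fails
    }
  }
}
```

Without the try/catch/finally, an `AssertionError` thrown inside `run()` only terminates the worker thread; the enclosing test keeps running and passes, which is exactly the masking problem described above.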
--- .../org/apache/spark/ThreadingSuite.scala | 68 ++++++++++++------- 1 file changed, 45 insertions(+), 23 deletions(-) diff --git a/core/src/test/scala/org/apache/spark/ThreadingSuite.scala b/core/src/test/scala/org/apache/spark/ThreadingSuite.scala index 48509f0759a3b..cda2b245526f7 100644 --- a/core/src/test/scala/org/apache/spark/ThreadingSuite.scala +++ b/core/src/test/scala/org/apache/spark/ThreadingSuite.scala @@ -119,23 +119,30 @@ class ThreadingSuite extends SparkFunSuite with LocalSparkContext with Logging { val nums = sc.parallelize(1 to 2, 2) val sem = new Semaphore(0) ThreadingSuiteState.clear() + var throwable: Option[Throwable] = None for (i <- 0 until 2) { new Thread { override def run() { - val ans = nums.map(number => { - val running = ThreadingSuiteState.runningThreads - running.getAndIncrement() - val time = System.currentTimeMillis() - while (running.get() != 4 && System.currentTimeMillis() < time + 1000) { - Thread.sleep(100) - } - if (running.get() != 4) { - ThreadingSuiteState.failed.set(true) - } - number - }).collect() - assert(ans.toList === List(1, 2)) - sem.release() + try { + val ans = nums.map(number => { + val running = ThreadingSuiteState.runningThreads + running.getAndIncrement() + val time = System.currentTimeMillis() + while (running.get() != 4 && System.currentTimeMillis() < time + 1000) { + Thread.sleep(100) + } + if (running.get() != 4) { + ThreadingSuiteState.failed.set(true) + } + number + }).collect() + assert(ans.toList === List(1, 2)) + } catch { + case t: Throwable => + throwable = Some(t) + } finally { + sem.release() + } } }.start() } @@ -145,18 +152,25 @@ class ThreadingSuite extends SparkFunSuite with LocalSparkContext with Logging { ThreadingSuiteState.runningThreads.get() + "); failing test") fail("One or more threads didn't see runningThreads = 4") } + throwable.foreach { t => throw t } } test("set local properties in different thread") { sc = new SparkContext("local", "test") val sem = new Semaphore(0) - + var throwable: Option[Throwable] = None val threads = (1 to 5).map { i => new Thread() { override def run() { - sc.setLocalProperty("test", i.toString) - assert(sc.getLocalProperty("test") === i.toString) - sem.release() + try { + sc.setLocalProperty("test", i.toString) + assert(sc.getLocalProperty("test") === i.toString) + } catch { + case t: Throwable => + throwable = Some(t) + } finally { + sem.release() + } } } } @@ -165,20 +179,27 @@ class ThreadingSuite extends SparkFunSuite with LocalSparkContext with Logging { sem.acquire(5) assert(sc.getLocalProperty("test") === null) + throwable.foreach { t => throw t } } test("set and get local properties in parent-children thread") { sc = new SparkContext("local", "test") sc.setLocalProperty("test", "parent") val sem = new Semaphore(0) - + var throwable: Option[Throwable] = None val threads = (1 to 5).map { i => new Thread() { override def run() { - assert(sc.getLocalProperty("test") === "parent") - sc.setLocalProperty("test", i.toString) - assert(sc.getLocalProperty("test") === i.toString) - sem.release() + try { + assert(sc.getLocalProperty("test") === "parent") + sc.setLocalProperty("test", i.toString) + assert(sc.getLocalProperty("test") === i.toString) + } catch { + case t: Throwable => + throwable = Some(t) + } finally { + sem.release() + } } } } @@ -188,6 +209,7 @@ class ThreadingSuite extends SparkFunSuite with LocalSparkContext with Logging { sem.acquire(5) assert(sc.getLocalProperty("test") === "parent") assert(sc.getLocalProperty("Foo") === null) + throwable.foreach { t => throw 
t } } test("mutations to local properties should not affect submitted jobs (SPARK-6629)") { From c34fc19765bdf55365cdce78d9ba11b220b73bb6 Mon Sep 17 00:00:00 2001 From: 0x0FFF Date: Fri, 11 Sep 2015 15:19:04 -0700 Subject: [PATCH 264/802] [SPARK-9014] [SQL] Allow Python spark API to use built-in exponential operator This PR addresses (SPARK-9014)[https://issues.apache.org/jira/browse/SPARK-9014] Added functionality: `Column` object in Python now supports exponential operator `**` Example: ``` from pyspark.sql import * df = sqlContext.createDataFrame([Row(a=2)]) df.select(3**df.a,df.a**3,df.a**df.a).collect() ``` Outputs: ``` [Row(POWER(3.0, a)=9.0, POWER(a, 3.0)=8.0, POWER(a, a)=4.0)] ``` Author: 0x0FFF Closes #8658 from 0x0FFF/SPARK-9014. --- python/pyspark/sql/column.py | 13 +++++++++++++ python/pyspark/sql/tests.py | 2 +- 2 files changed, 14 insertions(+), 1 deletion(-) diff --git a/python/pyspark/sql/column.py b/python/pyspark/sql/column.py index 573f65f5bf096..9ca8e1f264cfa 100644 --- a/python/pyspark/sql/column.py +++ b/python/pyspark/sql/column.py @@ -91,6 +91,17 @@ def _(self): return _ +def _bin_func_op(name, reverse=False, doc="binary function"): + def _(self, other): + sc = SparkContext._active_spark_context + fn = getattr(sc._jvm.functions, name) + jc = other._jc if isinstance(other, Column) else _create_column_from_literal(other) + njc = fn(self._jc, jc) if not reverse else fn(jc, self._jc) + return Column(njc) + _.__doc__ = doc + return _ + + def _bin_op(name, doc="binary operator"): """ Create a method for given binary operator """ @@ -151,6 +162,8 @@ def __init__(self, jc): __rdiv__ = _reverse_op("divide") __rtruediv__ = _reverse_op("divide") __rmod__ = _reverse_op("mod") + __pow__ = _bin_func_op("pow") + __rpow__ = _bin_func_op("pow", reverse=True) # logistic operators __eq__ = _bin_op("equalTo") diff --git a/python/pyspark/sql/tests.py b/python/pyspark/sql/tests.py index eb449e8679fa0..f2172b7a27d88 100644 --- a/python/pyspark/sql/tests.py +++ b/python/pyspark/sql/tests.py @@ -568,7 +568,7 @@ def test_column_operators(self): cs = self.df.value c = ci == cs self.assertTrue(isinstance((- ci - 1 - 2) % 3 * 2.5 / 3.5, Column)) - rcc = (1 + ci), (1 - ci), (1 * ci), (1 / ci), (1 % ci) + rcc = (1 + ci), (1 - ci), (1 * ci), (1 / ci), (1 % ci), (1 ** ci), (ci ** 1) self.assertTrue(all(isinstance(c, Column) for c in rcc)) cb = [ci == 5, ci != 0, ci > 3, ci < 4, ci >= 0, ci <= 7] self.assertTrue(all(isinstance(c, Column) for c in cb)) From 6d8367807cb62c2cb139cee1d039dc8b12c63385 Mon Sep 17 00:00:00 2001 From: Daniel Imfeld Date: Sat, 12 Sep 2015 09:19:59 +0100 Subject: [PATCH 265/802] [SPARK-10566] [CORE] SnappyCompressionCodec init exception handling masks important error information When throwing an IllegalArgumentException in SnappyCompressionCodec.init, chain the existing exception. This allows potentially important debugging info to be passed to the user. Manual testing shows the exception chained properly, and the test suite still looks fine as well. This contribution is my original work and I license the work to the project under the project's open source license. Author: Daniel Imfeld Closes #8725 from dimfeld/dimfeld-patch-1. 
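As a quick illustration of why chaining matters (a hedged sketch with a made-up cause, not code from the patch): constructing the wrapper exception with the original error as its cause preserves the root message and stack trace, which a bare `new IllegalArgumentException()` discards.

```
public final class ChainingSketch {
  public static void main(String[] args) {
    // Hypothetical root cause, e.g. a missing native library.
    Error cause = new UnsatisfiedLinkError("no snappyjava in java.library.path");

    IllegalArgumentException masked = new IllegalArgumentException();       // root cause lost
    IllegalArgumentException chained = new IllegalArgumentException(cause); // root cause kept

    System.out.println(masked.getCause());   // prints: null
    System.out.println(chained.getCause());  // prints the UnsatisfiedLinkError with its message
  }
}
```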
--- core/src/main/scala/org/apache/spark/io/CompressionCodec.scala | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/core/src/main/scala/org/apache/spark/io/CompressionCodec.scala b/core/src/main/scala/org/apache/spark/io/CompressionCodec.scala index 607d5a321efca..9dc36704a676d 100644 --- a/core/src/main/scala/org/apache/spark/io/CompressionCodec.scala +++ b/core/src/main/scala/org/apache/spark/io/CompressionCodec.scala @@ -148,7 +148,7 @@ class SnappyCompressionCodec(conf: SparkConf) extends CompressionCodec { try { Snappy.getNativeLibraryVersion } catch { - case e: Error => throw new IllegalArgumentException + case e: Error => throw new IllegalArgumentException(e) } override def compressedOutputStream(s: OutputStream): OutputStream = { From 8285e3b0d3dc0eff669eba993742dfe0401116f9 Mon Sep 17 00:00:00 2001 From: Nithin Asokan Date: Sat, 12 Sep 2015 09:50:49 +0100 Subject: [PATCH 266/802] [SPARK-10554] [CORE] Fix NPE with ShutdownHook https://issues.apache.org/jira/browse/SPARK-10554 Fixes NPE when ShutdownHook tries to cleanup temporary folders Author: Nithin Asokan Closes #8720 from nasokan/SPARK-10554. --- .../scala/org/apache/spark/storage/DiskBlockManager.scala | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/core/src/main/scala/org/apache/spark/storage/DiskBlockManager.scala b/core/src/main/scala/org/apache/spark/storage/DiskBlockManager.scala index 3f8d26e1d4cab..f7e84a2c2e14c 100644 --- a/core/src/main/scala/org/apache/spark/storage/DiskBlockManager.scala +++ b/core/src/main/scala/org/apache/spark/storage/DiskBlockManager.scala @@ -164,7 +164,9 @@ private[spark] class DiskBlockManager(blockManager: BlockManager, conf: SparkCon private def doStop(): Unit = { // Only perform cleanup if an external service is not serving our shuffle files. - if (!blockManager.externalShuffleServiceEnabled || blockManager.blockManagerId.isDriver) { + // Also blockManagerId could be null if block manager is not initialized properly. + if (!blockManager.externalShuffleServiceEnabled || + (blockManager.blockManagerId != null && blockManager.blockManagerId.isDriver)) { localDirs.foreach { localDir => if (localDir.isDirectory() && localDir.exists()) { try { From 22730ad54d681ad30e63fe910e8d89360853177d Mon Sep 17 00:00:00 2001 From: Sean Owen Date: Sat, 12 Sep 2015 10:40:10 +0100 Subject: [PATCH 267/802] [SPARK-10547] [TEST] Streamline / improve style of Java API tests Fix a few Java API test style issues: unused generic types, exceptions, wrong assert argument order Author: Sean Owen Closes #8706 from srowen/SPARK-10547. 
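The kinds of cleanups this commit makes are easiest to see side by side. The snippet below is an illustrative sketch (names are made up, and it assumes JUnit on the classpath), not code from the patch: redundant type arguments collapse to the Java 7 diamond operator, unnecessary `throws` clauses are dropped, and JUnit's `assertEquals(expected, actual)` gets its arguments in the documented order so failure messages read correctly.

```
import java.util.ArrayList;
import java.util.List;
import org.junit.Assert;

public final class StyleSketch {
  // Before: public static List<String> build() throws Exception
  // After: nothing checked is thrown, so the throws clause goes away.
  public static List<String> build() {
    // Before: new ArrayList<String>()  ->  After: diamond operator.
    List<String> values = new ArrayList<>();
    values.add("a");
    return values;
  }

  public static void main(String[] args) {
    // assertEquals takes (expected, actual); swapping them still passes or fails
    // the same way, but a failure would report the two values backwards.
    Assert.assertEquals("a", build().get(0));
  }
}
```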
--- .../java/org/apache/spark/JavaAPISuite.java | 451 ++++++----- .../kafka/JavaDirectKafkaStreamSuite.java | 24 +- .../streaming/kafka/JavaKafkaRDDSuite.java | 17 +- .../streaming/kafka/JavaKafkaStreamSuite.java | 14 +- .../twitter/JavaTwitterStreamSuite.java | 4 +- .../java/org/apache/spark/Java8APISuite.java | 46 +- .../spark/sql/JavaApplySchemaSuite.java | 39 +- .../apache/spark/sql/JavaDataFrameSuite.java | 29 +- .../org/apache/spark/sql/JavaRowSuite.java | 15 +- .../org/apache/spark/sql/JavaUDFSuite.java | 9 +- .../spark/sql/sources/JavaSaveLoadSuite.java | 10 +- .../spark/sql/hive/JavaDataFrameSuite.java | 8 +- .../hive/JavaMetastoreDataSourcesSuite.java | 12 +- .../apache/spark/streaming/JavaAPISuite.java | 752 +++++++++--------- .../spark/streaming/JavaReceiverAPISuite.java | 86 +- 15 files changed, 755 insertions(+), 761 deletions(-) diff --git a/core/src/test/java/org/apache/spark/JavaAPISuite.java b/core/src/test/java/org/apache/spark/JavaAPISuite.java index ebd3d61ae7324..fd8f7f39b7cc8 100644 --- a/core/src/test/java/org/apache/spark/JavaAPISuite.java +++ b/core/src/test/java/org/apache/spark/JavaAPISuite.java @@ -90,7 +90,7 @@ public void sparkContextUnion() { JavaRDD sUnion = sc.union(s1, s2); Assert.assertEquals(4, sUnion.count()); // List - List> list = new ArrayList>(); + List> list = new ArrayList<>(); list.add(s2); sUnion = sc.union(s1, list); Assert.assertEquals(4, sUnion.count()); @@ -103,9 +103,9 @@ public void sparkContextUnion() { Assert.assertEquals(4, dUnion.count()); // Union of JavaPairRDDs - List> pairs = new ArrayList>(); - pairs.add(new Tuple2(1, 2)); - pairs.add(new Tuple2(3, 4)); + List> pairs = new ArrayList<>(); + pairs.add(new Tuple2<>(1, 2)); + pairs.add(new Tuple2<>(3, 4)); JavaPairRDD p1 = sc.parallelizePairs(pairs); JavaPairRDD p2 = sc.parallelizePairs(pairs); JavaPairRDD pUnion = sc.union(p1, p2); @@ -133,9 +133,9 @@ public void intersection() { JavaDoubleRDD dIntersection = d1.intersection(d2); Assert.assertEquals(2, dIntersection.count()); - List> pairs = new ArrayList>(); - pairs.add(new Tuple2(1, 2)); - pairs.add(new Tuple2(3, 4)); + List> pairs = new ArrayList<>(); + pairs.add(new Tuple2<>(1, 2)); + pairs.add(new Tuple2<>(3, 4)); JavaPairRDD p1 = sc.parallelizePairs(pairs); JavaPairRDD p2 = sc.parallelizePairs(pairs); JavaPairRDD pIntersection = p1.intersection(p2); @@ -165,47 +165,49 @@ public void randomSplit() { @Test public void sortByKey() { - List> pairs = new ArrayList>(); - pairs.add(new Tuple2(0, 4)); - pairs.add(new Tuple2(3, 2)); - pairs.add(new Tuple2(-1, 1)); + List> pairs = new ArrayList<>(); + pairs.add(new Tuple2<>(0, 4)); + pairs.add(new Tuple2<>(3, 2)); + pairs.add(new Tuple2<>(-1, 1)); JavaPairRDD rdd = sc.parallelizePairs(pairs); // Default comparator JavaPairRDD sortedRDD = rdd.sortByKey(); - Assert.assertEquals(new Tuple2(-1, 1), sortedRDD.first()); + Assert.assertEquals(new Tuple2<>(-1, 1), sortedRDD.first()); List> sortedPairs = sortedRDD.collect(); - Assert.assertEquals(new Tuple2(0, 4), sortedPairs.get(1)); - Assert.assertEquals(new Tuple2(3, 2), sortedPairs.get(2)); + Assert.assertEquals(new Tuple2<>(0, 4), sortedPairs.get(1)); + Assert.assertEquals(new Tuple2<>(3, 2), sortedPairs.get(2)); // Custom comparator sortedRDD = rdd.sortByKey(Collections.reverseOrder(), false); - Assert.assertEquals(new Tuple2(-1, 1), sortedRDD.first()); + Assert.assertEquals(new Tuple2<>(-1, 1), sortedRDD.first()); sortedPairs = sortedRDD.collect(); - Assert.assertEquals(new Tuple2(0, 4), sortedPairs.get(1)); - Assert.assertEquals(new 
Tuple2(3, 2), sortedPairs.get(2)); + Assert.assertEquals(new Tuple2<>(0, 4), sortedPairs.get(1)); + Assert.assertEquals(new Tuple2<>(3, 2), sortedPairs.get(2)); } @SuppressWarnings("unchecked") @Test public void repartitionAndSortWithinPartitions() { - List> pairs = new ArrayList>(); - pairs.add(new Tuple2(0, 5)); - pairs.add(new Tuple2(3, 8)); - pairs.add(new Tuple2(2, 6)); - pairs.add(new Tuple2(0, 8)); - pairs.add(new Tuple2(3, 8)); - pairs.add(new Tuple2(1, 3)); + List> pairs = new ArrayList<>(); + pairs.add(new Tuple2<>(0, 5)); + pairs.add(new Tuple2<>(3, 8)); + pairs.add(new Tuple2<>(2, 6)); + pairs.add(new Tuple2<>(0, 8)); + pairs.add(new Tuple2<>(3, 8)); + pairs.add(new Tuple2<>(1, 3)); JavaPairRDD rdd = sc.parallelizePairs(pairs); Partitioner partitioner = new Partitioner() { + @Override public int numPartitions() { return 2; } + @Override public int getPartition(Object key) { - return ((Integer)key).intValue() % 2; + return (Integer) key % 2; } }; @@ -214,10 +216,10 @@ public int getPartition(Object key) { Assert.assertTrue(repartitioned.partitioner().isPresent()); Assert.assertEquals(repartitioned.partitioner().get(), partitioner); List>> partitions = repartitioned.glom().collect(); - Assert.assertEquals(partitions.get(0), Arrays.asList(new Tuple2(0, 5), - new Tuple2(0, 8), new Tuple2(2, 6))); - Assert.assertEquals(partitions.get(1), Arrays.asList(new Tuple2(1, 3), - new Tuple2(3, 8), new Tuple2(3, 8))); + Assert.assertEquals(partitions.get(0), + Arrays.asList(new Tuple2<>(0, 5), new Tuple2<>(0, 8), new Tuple2<>(2, 6))); + Assert.assertEquals(partitions.get(1), + Arrays.asList(new Tuple2<>(1, 3), new Tuple2<>(3, 8), new Tuple2<>(3, 8))); } @Test @@ -228,35 +230,37 @@ public void emptyRDD() { @Test public void sortBy() { - List> pairs = new ArrayList>(); - pairs.add(new Tuple2(0, 4)); - pairs.add(new Tuple2(3, 2)); - pairs.add(new Tuple2(-1, 1)); + List> pairs = new ArrayList<>(); + pairs.add(new Tuple2<>(0, 4)); + pairs.add(new Tuple2<>(3, 2)); + pairs.add(new Tuple2<>(-1, 1)); JavaRDD> rdd = sc.parallelize(pairs); // compare on first value JavaRDD> sortedRDD = rdd.sortBy(new Function, Integer>() { - public Integer call(Tuple2 t) throws Exception { + @Override + public Integer call(Tuple2 t) { return t._1(); } }, true, 2); - Assert.assertEquals(new Tuple2(-1, 1), sortedRDD.first()); + Assert.assertEquals(new Tuple2<>(-1, 1), sortedRDD.first()); List> sortedPairs = sortedRDD.collect(); - Assert.assertEquals(new Tuple2(0, 4), sortedPairs.get(1)); - Assert.assertEquals(new Tuple2(3, 2), sortedPairs.get(2)); + Assert.assertEquals(new Tuple2<>(0, 4), sortedPairs.get(1)); + Assert.assertEquals(new Tuple2<>(3, 2), sortedPairs.get(2)); // compare on second value sortedRDD = rdd.sortBy(new Function, Integer>() { - public Integer call(Tuple2 t) throws Exception { + @Override + public Integer call(Tuple2 t) { return t._2(); } }, true, 2); - Assert.assertEquals(new Tuple2(-1, 1), sortedRDD.first()); + Assert.assertEquals(new Tuple2<>(-1, 1), sortedRDD.first()); sortedPairs = sortedRDD.collect(); - Assert.assertEquals(new Tuple2(3, 2), sortedPairs.get(1)); - Assert.assertEquals(new Tuple2(0, 4), sortedPairs.get(2)); + Assert.assertEquals(new Tuple2<>(3, 2), sortedPairs.get(1)); + Assert.assertEquals(new Tuple2<>(0, 4), sortedPairs.get(2)); } @Test @@ -265,7 +269,7 @@ public void foreach() { JavaRDD rdd = sc.parallelize(Arrays.asList("Hello", "World")); rdd.foreach(new VoidFunction() { @Override - public void call(String s) throws IOException { + public void call(String s) { accum.add(1); } 
}); @@ -278,7 +282,7 @@ public void foreachPartition() { JavaRDD rdd = sc.parallelize(Arrays.asList("Hello", "World")); rdd.foreachPartition(new VoidFunction>() { @Override - public void call(Iterator iter) throws IOException { + public void call(Iterator iter) { while (iter.hasNext()) { iter.next(); accum.add(1); @@ -301,7 +305,7 @@ public void zipWithUniqueId() { List dataArray = Arrays.asList(1, 2, 3, 4); JavaPairRDD zip = sc.parallelize(dataArray).zipWithUniqueId(); JavaRDD indexes = zip.values(); - Assert.assertEquals(4, new HashSet(indexes.collect()).size()); + Assert.assertEquals(4, new HashSet<>(indexes.collect()).size()); } @Test @@ -317,10 +321,10 @@ public void zipWithIndex() { @Test public void lookup() { JavaPairRDD categories = sc.parallelizePairs(Arrays.asList( - new Tuple2("Apples", "Fruit"), - new Tuple2("Oranges", "Fruit"), - new Tuple2("Oranges", "Citrus") - )); + new Tuple2<>("Apples", "Fruit"), + new Tuple2<>("Oranges", "Fruit"), + new Tuple2<>("Oranges", "Citrus") + )); Assert.assertEquals(2, categories.lookup("Oranges").size()); Assert.assertEquals(2, Iterables.size(categories.groupByKey().lookup("Oranges").get(0))); } @@ -390,18 +394,17 @@ public String call(Tuple2 x) { @Test public void cogroup() { JavaPairRDD categories = sc.parallelizePairs(Arrays.asList( - new Tuple2("Apples", "Fruit"), - new Tuple2("Oranges", "Fruit"), - new Tuple2("Oranges", "Citrus") + new Tuple2<>("Apples", "Fruit"), + new Tuple2<>("Oranges", "Fruit"), + new Tuple2<>("Oranges", "Citrus") )); JavaPairRDD prices = sc.parallelizePairs(Arrays.asList( - new Tuple2("Oranges", 2), - new Tuple2("Apples", 3) + new Tuple2<>("Oranges", 2), + new Tuple2<>("Apples", 3) )); JavaPairRDD, Iterable>> cogrouped = categories.cogroup(prices); - Assert.assertEquals("[Fruit, Citrus]", - Iterables.toString(cogrouped.lookup("Oranges").get(0)._1())); + Assert.assertEquals("[Fruit, Citrus]", Iterables.toString(cogrouped.lookup("Oranges").get(0)._1())); Assert.assertEquals("[2]", Iterables.toString(cogrouped.lookup("Oranges").get(0)._2())); cogrouped.collect(); @@ -411,23 +414,22 @@ public void cogroup() { @Test public void cogroup3() { JavaPairRDD categories = sc.parallelizePairs(Arrays.asList( - new Tuple2("Apples", "Fruit"), - new Tuple2("Oranges", "Fruit"), - new Tuple2("Oranges", "Citrus") + new Tuple2<>("Apples", "Fruit"), + new Tuple2<>("Oranges", "Fruit"), + new Tuple2<>("Oranges", "Citrus") )); JavaPairRDD prices = sc.parallelizePairs(Arrays.asList( - new Tuple2("Oranges", 2), - new Tuple2("Apples", 3) + new Tuple2<>("Oranges", 2), + new Tuple2<>("Apples", 3) )); JavaPairRDD quantities = sc.parallelizePairs(Arrays.asList( - new Tuple2("Oranges", 21), - new Tuple2("Apples", 42) + new Tuple2<>("Oranges", 21), + new Tuple2<>("Apples", 42) )); JavaPairRDD, Iterable, Iterable>> cogrouped = categories.cogroup(prices, quantities); - Assert.assertEquals("[Fruit, Citrus]", - Iterables.toString(cogrouped.lookup("Oranges").get(0)._1())); + Assert.assertEquals("[Fruit, Citrus]", Iterables.toString(cogrouped.lookup("Oranges").get(0)._1())); Assert.assertEquals("[2]", Iterables.toString(cogrouped.lookup("Oranges").get(0)._2())); Assert.assertEquals("[42]", Iterables.toString(cogrouped.lookup("Apples").get(0)._3())); @@ -439,27 +441,26 @@ public void cogroup3() { @Test public void cogroup4() { JavaPairRDD categories = sc.parallelizePairs(Arrays.asList( - new Tuple2("Apples", "Fruit"), - new Tuple2("Oranges", "Fruit"), - new Tuple2("Oranges", "Citrus") + new Tuple2<>("Apples", "Fruit"), + new Tuple2<>("Oranges", "Fruit"), + 
new Tuple2<>("Oranges", "Citrus") )); JavaPairRDD prices = sc.parallelizePairs(Arrays.asList( - new Tuple2("Oranges", 2), - new Tuple2("Apples", 3) + new Tuple2<>("Oranges", 2), + new Tuple2<>("Apples", 3) )); JavaPairRDD quantities = sc.parallelizePairs(Arrays.asList( - new Tuple2("Oranges", 21), - new Tuple2("Apples", 42) + new Tuple2<>("Oranges", 21), + new Tuple2<>("Apples", 42) )); JavaPairRDD countries = sc.parallelizePairs(Arrays.asList( - new Tuple2("Oranges", "BR"), - new Tuple2("Apples", "US") + new Tuple2<>("Oranges", "BR"), + new Tuple2<>("Apples", "US") )); JavaPairRDD, Iterable, Iterable, Iterable>> cogrouped = categories.cogroup(prices, quantities, countries); - Assert.assertEquals("[Fruit, Citrus]", - Iterables.toString(cogrouped.lookup("Oranges").get(0)._1())); + Assert.assertEquals("[Fruit, Citrus]", Iterables.toString(cogrouped.lookup("Oranges").get(0)._1())); Assert.assertEquals("[2]", Iterables.toString(cogrouped.lookup("Oranges").get(0)._2())); Assert.assertEquals("[42]", Iterables.toString(cogrouped.lookup("Apples").get(0)._3())); Assert.assertEquals("[BR]", Iterables.toString(cogrouped.lookup("Oranges").get(0)._4())); @@ -471,16 +472,16 @@ public void cogroup4() { @Test public void leftOuterJoin() { JavaPairRDD rdd1 = sc.parallelizePairs(Arrays.asList( - new Tuple2(1, 1), - new Tuple2(1, 2), - new Tuple2(2, 1), - new Tuple2(3, 1) + new Tuple2<>(1, 1), + new Tuple2<>(1, 2), + new Tuple2<>(2, 1), + new Tuple2<>(3, 1) )); JavaPairRDD rdd2 = sc.parallelizePairs(Arrays.asList( - new Tuple2(1, 'x'), - new Tuple2(2, 'y'), - new Tuple2(2, 'z'), - new Tuple2(4, 'w') + new Tuple2<>(1, 'x'), + new Tuple2<>(2, 'y'), + new Tuple2<>(2, 'z'), + new Tuple2<>(4, 'w') )); List>>> joined = rdd1.leftOuterJoin(rdd2).collect(); @@ -548,11 +549,11 @@ public Integer call(Integer a, Integer b) { public void aggregateByKey() { JavaPairRDD pairs = sc.parallelizePairs( Arrays.asList( - new Tuple2(1, 1), - new Tuple2(1, 1), - new Tuple2(3, 2), - new Tuple2(5, 1), - new Tuple2(5, 3)), 2); + new Tuple2<>(1, 1), + new Tuple2<>(1, 1), + new Tuple2<>(3, 2), + new Tuple2<>(5, 1), + new Tuple2<>(5, 3)), 2); Map> sets = pairs.aggregateByKey(new HashSet(), new Function2, Integer, Set>() { @@ -570,20 +571,20 @@ public Set call(Set a, Set b) { } }).collectAsMap(); Assert.assertEquals(3, sets.size()); - Assert.assertEquals(new HashSet(Arrays.asList(1)), sets.get(1)); - Assert.assertEquals(new HashSet(Arrays.asList(2)), sets.get(3)); - Assert.assertEquals(new HashSet(Arrays.asList(1, 3)), sets.get(5)); + Assert.assertEquals(new HashSet<>(Arrays.asList(1)), sets.get(1)); + Assert.assertEquals(new HashSet<>(Arrays.asList(2)), sets.get(3)); + Assert.assertEquals(new HashSet<>(Arrays.asList(1, 3)), sets.get(5)); } @SuppressWarnings("unchecked") @Test public void foldByKey() { List> pairs = Arrays.asList( - new Tuple2(2, 1), - new Tuple2(2, 1), - new Tuple2(1, 1), - new Tuple2(3, 2), - new Tuple2(3, 1) + new Tuple2<>(2, 1), + new Tuple2<>(2, 1), + new Tuple2<>(1, 1), + new Tuple2<>(3, 2), + new Tuple2<>(3, 1) ); JavaPairRDD rdd = sc.parallelizePairs(pairs); JavaPairRDD sums = rdd.foldByKey(0, @@ -602,11 +603,11 @@ public Integer call(Integer a, Integer b) { @Test public void reduceByKey() { List> pairs = Arrays.asList( - new Tuple2(2, 1), - new Tuple2(2, 1), - new Tuple2(1, 1), - new Tuple2(3, 2), - new Tuple2(3, 1) + new Tuple2<>(2, 1), + new Tuple2<>(2, 1), + new Tuple2<>(1, 1), + new Tuple2<>(3, 2), + new Tuple2<>(3, 1) ); JavaPairRDD rdd = sc.parallelizePairs(pairs); JavaPairRDD counts = rdd.reduceByKey( @@ 
-690,7 +691,7 @@ public void cartesian() { JavaDoubleRDD doubleRDD = sc.parallelizeDoubles(Arrays.asList(1.0, 1.0, 2.0, 3.0, 5.0, 8.0)); JavaRDD stringRDD = sc.parallelize(Arrays.asList("Hello", "World")); JavaPairRDD cartesian = stringRDD.cartesian(doubleRDD); - Assert.assertEquals(new Tuple2("Hello", 1.0), cartesian.first()); + Assert.assertEquals(new Tuple2<>("Hello", 1.0), cartesian.first()); } @Test @@ -743,6 +744,7 @@ public void javaDoubleRDDHistoGram() { } private static class DoubleComparator implements Comparator, Serializable { + @Override public int compare(Double o1, Double o2) { return o1.compareTo(o2); } @@ -766,14 +768,14 @@ public void min() { public void naturalMax() { JavaDoubleRDD rdd = sc.parallelizeDoubles(Arrays.asList(1.0, 2.0, 3.0, 4.0)); double max = rdd.max(); - Assert.assertTrue(4.0 == max); + Assert.assertEquals(4.0, max, 0.0); } @Test public void naturalMin() { JavaDoubleRDD rdd = sc.parallelizeDoubles(Arrays.asList(1.0, 2.0, 3.0, 4.0)); double max = rdd.min(); - Assert.assertTrue(1.0 == max); + Assert.assertEquals(1.0, max, 0.0); } @Test @@ -809,7 +811,7 @@ public void reduceOnJavaDoubleRDD() { JavaDoubleRDD rdd = sc.parallelizeDoubles(Arrays.asList(1.0, 2.0, 3.0, 4.0)); double sum = rdd.reduce(new Function2() { @Override - public Double call(Double v1, Double v2) throws Exception { + public Double call(Double v1, Double v2) { return v1 + v2; } }); @@ -844,7 +846,7 @@ public double call(Integer x) { new PairFunction() { @Override public Tuple2 call(Integer x) { - return new Tuple2(x, x); + return new Tuple2<>(x, x); } }).cache(); pairs.collect(); @@ -870,26 +872,25 @@ public Iterable call(String x) { Assert.assertEquals("Hello", words.first()); Assert.assertEquals(11, words.count()); - JavaPairRDD pairs = rdd.flatMapToPair( + JavaPairRDD pairsRDD = rdd.flatMapToPair( new PairFlatMapFunction() { - @Override public Iterable> call(String s) { - List> pairs = new LinkedList>(); + List> pairs = new LinkedList<>(); for (String word : s.split(" ")) { - pairs.add(new Tuple2(word, word)); + pairs.add(new Tuple2<>(word, word)); } return pairs; } } ); - Assert.assertEquals(new Tuple2("Hello", "Hello"), pairs.first()); - Assert.assertEquals(11, pairs.count()); + Assert.assertEquals(new Tuple2<>("Hello", "Hello"), pairsRDD.first()); + Assert.assertEquals(11, pairsRDD.count()); JavaDoubleRDD doubles = rdd.flatMapToDouble(new DoubleFlatMapFunction() { @Override public Iterable call(String s) { - List lengths = new LinkedList(); + List lengths = new LinkedList<>(); for (String word : s.split(" ")) { lengths.add((double) word.length()); } @@ -897,36 +898,36 @@ public Iterable call(String s) { } }); Assert.assertEquals(5.0, doubles.first(), 0.01); - Assert.assertEquals(11, pairs.count()); + Assert.assertEquals(11, pairsRDD.count()); } @SuppressWarnings("unchecked") @Test public void mapsFromPairsToPairs() { - List> pairs = Arrays.asList( - new Tuple2(1, "a"), - new Tuple2(2, "aa"), - new Tuple2(3, "aaa") - ); - JavaPairRDD pairRDD = sc.parallelizePairs(pairs); - - // Regression test for SPARK-668: - JavaPairRDD swapped = pairRDD.flatMapToPair( - new PairFlatMapFunction, String, Integer>() { - @Override - public Iterable> call(Tuple2 item) { - return Collections.singletonList(item.swap()); - } + List> pairs = Arrays.asList( + new Tuple2<>(1, "a"), + new Tuple2<>(2, "aa"), + new Tuple2<>(3, "aaa") + ); + JavaPairRDD pairRDD = sc.parallelizePairs(pairs); + + // Regression test for SPARK-668: + JavaPairRDD swapped = pairRDD.flatMapToPair( + new PairFlatMapFunction, String, 
Integer>() { + @Override + public Iterable> call(Tuple2 item) { + return Collections.singletonList(item.swap()); + } }); - swapped.collect(); + swapped.collect(); - // There was never a bug here, but it's worth testing: - pairRDD.mapToPair(new PairFunction, String, Integer>() { - @Override - public Tuple2 call(Tuple2 item) { - return item.swap(); - } - }).collect(); + // There was never a bug here, but it's worth testing: + pairRDD.mapToPair(new PairFunction, String, Integer>() { + @Override + public Tuple2 call(Tuple2 item) { + return item.swap(); + } + }).collect(); } @Test @@ -953,7 +954,7 @@ public void mapPartitionsWithIndex() { JavaRDD partitionSums = rdd.mapPartitionsWithIndex( new Function2, Iterator>() { @Override - public Iterator call(Integer index, Iterator iter) throws Exception { + public Iterator call(Integer index, Iterator iter) { int sum = 0; while (iter.hasNext()) { sum += iter.next(); @@ -972,8 +973,8 @@ public void repartition() { JavaRDD repartitioned1 = in1.repartition(4); List> result1 = repartitioned1.glom().collect(); Assert.assertEquals(4, result1.size()); - for (List l: result1) { - Assert.assertTrue(l.size() > 0); + for (List l : result1) { + Assert.assertFalse(l.isEmpty()); } // Growing number of partitions @@ -982,7 +983,7 @@ public void repartition() { List> result2 = repartitioned2.glom().collect(); Assert.assertEquals(2, result2.size()); for (List l: result2) { - Assert.assertTrue(l.size() > 0); + Assert.assertFalse(l.isEmpty()); } } @@ -994,9 +995,9 @@ public void persist() { Assert.assertEquals(20, doubleRDD.sum(), 0.1); List> pairs = Arrays.asList( - new Tuple2(1, "a"), - new Tuple2(2, "aa"), - new Tuple2(3, "aaa") + new Tuple2<>(1, "a"), + new Tuple2<>(2, "aa"), + new Tuple2<>(3, "aaa") ); JavaPairRDD pairRDD = sc.parallelizePairs(pairs); pairRDD = pairRDD.persist(StorageLevel.DISK_ONLY()); @@ -1046,7 +1047,7 @@ public void wholeTextFiles() throws Exception { Files.write(content1, new File(tempDirName + "/part-00000")); Files.write(content2, new File(tempDirName + "/part-00001")); - Map container = new HashMap(); + Map container = new HashMap<>(); container.put(tempDirName+"/part-00000", new Text(content1).toString()); container.put(tempDirName+"/part-00001", new Text(content2).toString()); @@ -1075,16 +1076,16 @@ public void textFilesCompressed() throws IOException { public void sequenceFile() { String outputDir = new File(tempDir, "output").getAbsolutePath(); List> pairs = Arrays.asList( - new Tuple2(1, "a"), - new Tuple2(2, "aa"), - new Tuple2(3, "aaa") + new Tuple2<>(1, "a"), + new Tuple2<>(2, "aa"), + new Tuple2<>(3, "aaa") ); JavaPairRDD rdd = sc.parallelizePairs(pairs); rdd.mapToPair(new PairFunction, IntWritable, Text>() { @Override public Tuple2 call(Tuple2 pair) { - return new Tuple2(new IntWritable(pair._1()), new Text(pair._2())); + return new Tuple2<>(new IntWritable(pair._1()), new Text(pair._2())); } }).saveAsHadoopFile(outputDir, IntWritable.class, Text.class, SequenceFileOutputFormat.class); @@ -1093,7 +1094,7 @@ public Tuple2 call(Tuple2 pair) { Text.class).mapToPair(new PairFunction, Integer, String>() { @Override public Tuple2 call(Tuple2 pair) { - return new Tuple2(pair._1().get(), pair._2().toString()); + return new Tuple2<>(pair._1().get(), pair._2().toString()); } }); Assert.assertEquals(pairs, readRDD.collect()); @@ -1110,7 +1111,7 @@ public void binaryFiles() throws Exception { FileOutputStream fos1 = new FileOutputStream(file1); FileChannel channel1 = fos1.getChannel(); - ByteBuffer bbuf = java.nio.ByteBuffer.wrap(content1); 
+ ByteBuffer bbuf = ByteBuffer.wrap(content1); channel1.write(bbuf); channel1.close(); JavaPairRDD readRDD = sc.binaryFiles(tempDirName, 3); @@ -1131,14 +1132,14 @@ public void binaryFilesCaching() throws Exception { FileOutputStream fos1 = new FileOutputStream(file1); FileChannel channel1 = fos1.getChannel(); - ByteBuffer bbuf = java.nio.ByteBuffer.wrap(content1); + ByteBuffer bbuf = ByteBuffer.wrap(content1); channel1.write(bbuf); channel1.close(); JavaPairRDD readRDD = sc.binaryFiles(tempDirName).cache(); readRDD.foreach(new VoidFunction>() { @Override - public void call(Tuple2 pair) throws Exception { + public void call(Tuple2 pair) { pair._2().toArray(); // force the file to read } }); @@ -1162,7 +1163,7 @@ public void binaryRecords() throws Exception { FileChannel channel1 = fos1.getChannel(); for (int i = 0; i < numOfCopies; i++) { - ByteBuffer bbuf = java.nio.ByteBuffer.wrap(content1); + ByteBuffer bbuf = ByteBuffer.wrap(content1); channel1.write(bbuf); } channel1.close(); @@ -1180,24 +1181,23 @@ public void binaryRecords() throws Exception { public void writeWithNewAPIHadoopFile() { String outputDir = new File(tempDir, "output").getAbsolutePath(); List> pairs = Arrays.asList( - new Tuple2(1, "a"), - new Tuple2(2, "aa"), - new Tuple2(3, "aaa") + new Tuple2<>(1, "a"), + new Tuple2<>(2, "aa"), + new Tuple2<>(3, "aaa") ); JavaPairRDD rdd = sc.parallelizePairs(pairs); rdd.mapToPair(new PairFunction, IntWritable, Text>() { @Override public Tuple2 call(Tuple2 pair) { - return new Tuple2(new IntWritable(pair._1()), new Text(pair._2())); + return new Tuple2<>(new IntWritable(pair._1()), new Text(pair._2())); } - }).saveAsNewAPIHadoopFile(outputDir, IntWritable.class, Text.class, - org.apache.hadoop.mapreduce.lib.output.SequenceFileOutputFormat.class); + }).saveAsNewAPIHadoopFile( + outputDir, IntWritable.class, Text.class, + org.apache.hadoop.mapreduce.lib.output.SequenceFileOutputFormat.class); - JavaPairRDD output = sc.sequenceFile(outputDir, IntWritable.class, - Text.class); - Assert.assertEquals(pairs.toString(), output.map(new Function, - String>() { + JavaPairRDD output = sc.sequenceFile(outputDir, IntWritable.class, Text.class); + Assert.assertEquals(pairs.toString(), output.map(new Function, String>() { @Override public String call(Tuple2 x) { return x.toString(); @@ -1210,24 +1210,23 @@ public String call(Tuple2 x) { public void readWithNewAPIHadoopFile() throws IOException { String outputDir = new File(tempDir, "output").getAbsolutePath(); List> pairs = Arrays.asList( - new Tuple2(1, "a"), - new Tuple2(2, "aa"), - new Tuple2(3, "aaa") + new Tuple2<>(1, "a"), + new Tuple2<>(2, "aa"), + new Tuple2<>(3, "aaa") ); JavaPairRDD rdd = sc.parallelizePairs(pairs); rdd.mapToPair(new PairFunction, IntWritable, Text>() { @Override public Tuple2 call(Tuple2 pair) { - return new Tuple2(new IntWritable(pair._1()), new Text(pair._2())); + return new Tuple2<>(new IntWritable(pair._1()), new Text(pair._2())); } }).saveAsHadoopFile(outputDir, IntWritable.class, Text.class, SequenceFileOutputFormat.class); JavaPairRDD output = sc.newAPIHadoopFile(outputDir, - org.apache.hadoop.mapreduce.lib.input.SequenceFileInputFormat.class, IntWritable.class, - Text.class, new Job().getConfiguration()); - Assert.assertEquals(pairs.toString(), output.map(new Function, - String>() { + org.apache.hadoop.mapreduce.lib.input.SequenceFileInputFormat.class, + IntWritable.class, Text.class, new Job().getConfiguration()); + Assert.assertEquals(pairs.toString(), output.map(new Function, String>() { @Override public String 
call(Tuple2 x) { return x.toString(); @@ -1251,9 +1250,9 @@ public void objectFilesOfInts() { public void objectFilesOfComplexTypes() { String outputDir = new File(tempDir, "output").getAbsolutePath(); List> pairs = Arrays.asList( - new Tuple2(1, "a"), - new Tuple2(2, "aa"), - new Tuple2(3, "aaa") + new Tuple2<>(1, "a"), + new Tuple2<>(2, "aa"), + new Tuple2<>(3, "aaa") ); JavaPairRDD rdd = sc.parallelizePairs(pairs); rdd.saveAsObjectFile(outputDir); @@ -1267,23 +1266,22 @@ public void objectFilesOfComplexTypes() { public void hadoopFile() { String outputDir = new File(tempDir, "output").getAbsolutePath(); List> pairs = Arrays.asList( - new Tuple2(1, "a"), - new Tuple2(2, "aa"), - new Tuple2(3, "aaa") + new Tuple2<>(1, "a"), + new Tuple2<>(2, "aa"), + new Tuple2<>(3, "aaa") ); JavaPairRDD rdd = sc.parallelizePairs(pairs); rdd.mapToPair(new PairFunction, IntWritable, Text>() { @Override public Tuple2 call(Tuple2 pair) { - return new Tuple2(new IntWritable(pair._1()), new Text(pair._2())); + return new Tuple2<>(new IntWritable(pair._1()), new Text(pair._2())); } }).saveAsHadoopFile(outputDir, IntWritable.class, Text.class, SequenceFileOutputFormat.class); JavaPairRDD output = sc.hadoopFile(outputDir, - SequenceFileInputFormat.class, IntWritable.class, Text.class); - Assert.assertEquals(pairs.toString(), output.map(new Function, - String>() { + SequenceFileInputFormat.class, IntWritable.class, Text.class); + Assert.assertEquals(pairs.toString(), output.map(new Function, String>() { @Override public String call(Tuple2 x) { return x.toString(); @@ -1296,16 +1294,16 @@ public String call(Tuple2 x) { public void hadoopFileCompressed() { String outputDir = new File(tempDir, "output_compressed").getAbsolutePath(); List> pairs = Arrays.asList( - new Tuple2(1, "a"), - new Tuple2(2, "aa"), - new Tuple2(3, "aaa") + new Tuple2<>(1, "a"), + new Tuple2<>(2, "aa"), + new Tuple2<>(3, "aaa") ); JavaPairRDD rdd = sc.parallelizePairs(pairs); rdd.mapToPair(new PairFunction, IntWritable, Text>() { @Override public Tuple2 call(Tuple2 pair) { - return new Tuple2(new IntWritable(pair._1()), new Text(pair._2())); + return new Tuple2<>(new IntWritable(pair._1()), new Text(pair._2())); } }).saveAsHadoopFile(outputDir, IntWritable.class, Text.class, SequenceFileOutputFormat.class, DefaultCodec.class); @@ -1313,8 +1311,7 @@ public Tuple2 call(Tuple2 pair) { JavaPairRDD output = sc.hadoopFile(outputDir, SequenceFileInputFormat.class, IntWritable.class, Text.class); - Assert.assertEquals(pairs.toString(), output.map(new Function, - String>() { + Assert.assertEquals(pairs.toString(), output.map(new Function, String>() { @Override public String call(Tuple2 x) { return x.toString(); @@ -1414,8 +1411,8 @@ public String call(Integer t) { return t.toString(); } }).collect(); - Assert.assertEquals(new Tuple2("1", 1), s.get(0)); - Assert.assertEquals(new Tuple2("2", 2), s.get(1)); + Assert.assertEquals(new Tuple2<>("1", 1), s.get(0)); + Assert.assertEquals(new Tuple2<>("2", 2), s.get(1)); } @Test @@ -1448,20 +1445,20 @@ public void combineByKey() { JavaRDD originalRDD = sc.parallelize(Arrays.asList(1, 2, 3, 4, 5, 6)); Function keyFunction = new Function() { @Override - public Integer call(Integer v1) throws Exception { + public Integer call(Integer v1) { return v1 % 3; } }; Function createCombinerFunction = new Function() { @Override - public Integer call(Integer v1) throws Exception { + public Integer call(Integer v1) { return v1; } }; Function2 mergeValueFunction = new Function2() { @Override - public Integer call(Integer v1, 
Integer v2) throws Exception { + public Integer call(Integer v1, Integer v2) { return v1 + v2; } }; @@ -1496,21 +1493,21 @@ public void mapOnPairRDD() { new PairFunction() { @Override public Tuple2 call(Integer i) { - return new Tuple2(i, i % 2); + return new Tuple2<>(i, i % 2); } }); JavaPairRDD rdd3 = rdd2.mapToPair( new PairFunction, Integer, Integer>() { - @Override - public Tuple2 call(Tuple2 in) { - return new Tuple2(in._2(), in._1()); - } - }); + @Override + public Tuple2 call(Tuple2 in) { + return new Tuple2<>(in._2(), in._1()); + } + }); Assert.assertEquals(Arrays.asList( - new Tuple2(1, 1), - new Tuple2(0, 2), - new Tuple2(1, 3), - new Tuple2(0, 4)), rdd3.collect()); + new Tuple2<>(1, 1), + new Tuple2<>(0, 2), + new Tuple2<>(1, 3), + new Tuple2<>(0, 4)), rdd3.collect()); } @@ -1523,7 +1520,7 @@ public void collectPartitions() { new PairFunction() { @Override public Tuple2 call(Integer i) { - return new Tuple2(i, i % 2); + return new Tuple2<>(i, i % 2); } }); @@ -1534,23 +1531,23 @@ public Tuple2 call(Integer i) { Assert.assertEquals(Arrays.asList(3, 4), parts[0]); Assert.assertEquals(Arrays.asList(5, 6, 7), parts[1]); - Assert.assertEquals(Arrays.asList(new Tuple2(1, 1), - new Tuple2(2, 0)), + Assert.assertEquals(Arrays.asList(new Tuple2<>(1, 1), + new Tuple2<>(2, 0)), rdd2.collectPartitions(new int[] {0})[0]); List>[] parts2 = rdd2.collectPartitions(new int[] {1, 2}); - Assert.assertEquals(Arrays.asList(new Tuple2(3, 1), - new Tuple2(4, 0)), + Assert.assertEquals(Arrays.asList(new Tuple2<>(3, 1), + new Tuple2<>(4, 0)), parts2[0]); - Assert.assertEquals(Arrays.asList(new Tuple2(5, 1), - new Tuple2(6, 0), - new Tuple2(7, 1)), + Assert.assertEquals(Arrays.asList(new Tuple2<>(5, 1), + new Tuple2<>(6, 0), + new Tuple2<>(7, 1)), parts2[1]); } @Test public void countApproxDistinct() { - List arrayData = new ArrayList(); + List arrayData = new ArrayList<>(); int size = 100; for (int i = 0; i < 100000; i++) { arrayData.add(i % size); @@ -1561,15 +1558,15 @@ public void countApproxDistinct() { @Test public void countApproxDistinctByKey() { - List> arrayData = new ArrayList>(); + List> arrayData = new ArrayList<>(); for (int i = 10; i < 100; i++) { for (int j = 0; j < i; j++) { - arrayData.add(new Tuple2(i, j)); + arrayData.add(new Tuple2<>(i, j)); } } double relativeSD = 0.001; JavaPairRDD pairRdd = sc.parallelizePairs(arrayData); - List> res = pairRdd.countApproxDistinctByKey(8, 0).collect(); + List> res = pairRdd.countApproxDistinctByKey(relativeSD, 8).collect(); for (Tuple2 resItem : res) { double count = (double)resItem._1(); Long resCount = (Long)resItem._2(); @@ -1587,7 +1584,7 @@ public void collectAsMapWithIntArrayValues() { new PairFunction() { @Override public Tuple2 call(Integer x) { - return new Tuple2(x, new int[] { x }); + return new Tuple2<>(x, new int[]{x}); } }); pairRDD.collect(); // Works fine @@ -1598,7 +1595,7 @@ public Tuple2 call(Integer x) { @Test public void collectAsMapAndSerialize() throws Exception { JavaPairRDD rdd = - sc.parallelizePairs(Arrays.asList(new Tuple2("foo", 1))); + sc.parallelizePairs(Arrays.asList(new Tuple2<>("foo", 1))); Map map = rdd.collectAsMap(); ByteArrayOutputStream bytes = new ByteArrayOutputStream(); new ObjectOutputStream(bytes).writeObject(map); @@ -1615,7 +1612,7 @@ public void sampleByKey() { new PairFunction() { @Override public Tuple2 call(Integer i) { - return new Tuple2(i % 2, 1); + return new Tuple2<>(i % 2, 1); } }); Map fractions = Maps.newHashMap(); @@ -1623,12 +1620,12 @@ public Tuple2 call(Integer i) { fractions.put(1, 
1.0); JavaPairRDD wr = rdd2.sampleByKey(true, fractions, 1L); Map wrCounts = (Map) (Object) wr.countByKey(); - Assert.assertTrue(wrCounts.size() == 2); + Assert.assertEquals(2, wrCounts.size()); Assert.assertTrue(wrCounts.get(0) > 0); Assert.assertTrue(wrCounts.get(1) > 0); JavaPairRDD wor = rdd2.sampleByKey(false, fractions, 1L); Map worCounts = (Map) (Object) wor.countByKey(); - Assert.assertTrue(worCounts.size() == 2); + Assert.assertEquals(2, worCounts.size()); Assert.assertTrue(worCounts.get(0) > 0); Assert.assertTrue(worCounts.get(1) > 0); } @@ -1641,7 +1638,7 @@ public void sampleByKeyExact() { new PairFunction() { @Override public Tuple2 call(Integer i) { - return new Tuple2(i % 2, 1); + return new Tuple2<>(i % 2, 1); } }); Map fractions = Maps.newHashMap(); @@ -1649,25 +1646,25 @@ public Tuple2 call(Integer i) { fractions.put(1, 1.0); JavaPairRDD wrExact = rdd2.sampleByKeyExact(true, fractions, 1L); Map wrExactCounts = (Map) (Object) wrExact.countByKey(); - Assert.assertTrue(wrExactCounts.size() == 2); + Assert.assertEquals(2, wrExactCounts.size()); Assert.assertTrue(wrExactCounts.get(0) == 2); Assert.assertTrue(wrExactCounts.get(1) == 4); JavaPairRDD worExact = rdd2.sampleByKeyExact(false, fractions, 1L); Map worExactCounts = (Map) (Object) worExact.countByKey(); - Assert.assertTrue(worExactCounts.size() == 2); + Assert.assertEquals(2, worExactCounts.size()); Assert.assertTrue(worExactCounts.get(0) == 2); Assert.assertTrue(worExactCounts.get(1) == 4); } private static class SomeCustomClass implements Serializable { - public SomeCustomClass() { + SomeCustomClass() { // Intentionally left blank } } @Test public void collectUnderlyingScalaRDD() { - List data = new ArrayList(); + List data = new ArrayList<>(); for (int i = 0; i < 100; i++) { data.add(new SomeCustomClass()); } @@ -1679,7 +1676,7 @@ public void collectUnderlyingScalaRDD() { private static final class BuggyMapFunction implements Function { @Override - public T call(T x) throws Exception { + public T call(T x) { throw new IllegalStateException("Custom exception!"); } } @@ -1716,7 +1713,7 @@ public void foreachAsync() throws Exception { JavaFutureAction future = rdd.foreachAsync( new VoidFunction() { @Override - public void call(Integer integer) throws Exception { + public void call(Integer integer) { // intentionally left blank. } } @@ -1745,7 +1742,7 @@ public void testAsyncActionCancellation() throws Exception { JavaRDD rdd = sc.parallelize(data, 1); JavaFutureAction future = rdd.foreachAsync(new VoidFunction() { @Override - public void call(Integer integer) throws Exception { + public void call(Integer integer) throws InterruptedException { Thread.sleep(10000); // To ensure that the job won't finish before it's cancelled. 
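// Aside — a minimal sketch of how the async API exercised by these tests is normally
// driven (the names below are illustrative, not taken from the patch). JavaFutureAction
// extends java.util.concurrent.Future, so the usual Future methods apply:
//
//   JavaRDD<Integer> nums = sc.parallelize(Arrays.asList(1, 2, 3, 4, 5));
//   JavaFutureAction<Void> action = nums.foreachAsync(new VoidFunction<Integer>() {
//     @Override
//     public void call(Integer i) { /* per-element side effect */ }
//   });
//   action.cancel(true);                 // ask Spark to cancel the running job
//   boolean wasCancelled = action.isCancelled();
//   // or, to wait for completion instead of cancelling: action.get();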
} }); diff --git a/external/kafka/src/test/java/org/apache/spark/streaming/kafka/JavaDirectKafkaStreamSuite.java b/external/kafka/src/test/java/org/apache/spark/streaming/kafka/JavaDirectKafkaStreamSuite.java index 9db07d0507fea..fbdfbf7e509b3 100644 --- a/external/kafka/src/test/java/org/apache/spark/streaming/kafka/JavaDirectKafkaStreamSuite.java +++ b/external/kafka/src/test/java/org/apache/spark/streaming/kafka/JavaDirectKafkaStreamSuite.java @@ -75,11 +75,11 @@ public void testKafkaStream() throws InterruptedException { String[] topic1data = createTopicAndSendData(topic1); String[] topic2data = createTopicAndSendData(topic2); - HashSet sent = new HashSet(); + Set sent = new HashSet<>(); sent.addAll(Arrays.asList(topic1data)); sent.addAll(Arrays.asList(topic2data)); - HashMap kafkaParams = new HashMap(); + Map kafkaParams = new HashMap<>(); kafkaParams.put("metadata.broker.list", kafkaTestUtils.brokerAddress()); kafkaParams.put("auto.offset.reset", "smallest"); @@ -95,17 +95,17 @@ public void testKafkaStream() throws InterruptedException { // Make sure you can get offset ranges from the rdd new Function, JavaPairRDD>() { @Override - public JavaPairRDD call(JavaPairRDD rdd) throws Exception { + public JavaPairRDD call(JavaPairRDD rdd) { OffsetRange[] offsets = ((HasOffsetRanges) rdd.rdd()).offsetRanges(); offsetRanges.set(offsets); - Assert.assertEquals(offsets[0].topic(), topic1); + Assert.assertEquals(topic1, offsets[0].topic()); return rdd; } } ).map( new Function, String>() { @Override - public String call(Tuple2 kv) throws Exception { + public String call(Tuple2 kv) { return kv._2(); } } @@ -119,10 +119,10 @@ public String call(Tuple2 kv) throws Exception { StringDecoder.class, String.class, kafkaParams, - topicOffsetToMap(topic2, (long) 0), + topicOffsetToMap(topic2, 0L), new Function, String>() { @Override - public String call(MessageAndMetadata msgAndMd) throws Exception { + public String call(MessageAndMetadata msgAndMd) { return msgAndMd.message(); } } @@ -133,7 +133,7 @@ public String call(MessageAndMetadata msgAndMd) throws Exception unifiedStream.foreachRDD( new Function, Void>() { @Override - public Void call(JavaRDD rdd) throws Exception { + public Void call(JavaRDD rdd) { result.addAll(rdd.collect()); for (OffsetRange o : offsetRanges.get()) { System.out.println( @@ -155,14 +155,14 @@ public Void call(JavaRDD rdd) throws Exception { ssc.stop(); } - private HashSet topicToSet(String topic) { - HashSet topicSet = new HashSet(); + private static Set topicToSet(String topic) { + Set topicSet = new HashSet<>(); topicSet.add(topic); return topicSet; } - private HashMap topicOffsetToMap(String topic, Long offsetToStart) { - HashMap topicMap = new HashMap(); + private static Map topicOffsetToMap(String topic, Long offsetToStart) { + Map topicMap = new HashMap<>(); topicMap.put(new TopicAndPartition(topic, 0), offsetToStart); return topicMap; } diff --git a/external/kafka/src/test/java/org/apache/spark/streaming/kafka/JavaKafkaRDDSuite.java b/external/kafka/src/test/java/org/apache/spark/streaming/kafka/JavaKafkaRDDSuite.java index a9dc6e50613ca..afcc6cfccd39a 100644 --- a/external/kafka/src/test/java/org/apache/spark/streaming/kafka/JavaKafkaRDDSuite.java +++ b/external/kafka/src/test/java/org/apache/spark/streaming/kafka/JavaKafkaRDDSuite.java @@ -19,6 +19,7 @@ import java.io.Serializable; import java.util.HashMap; +import java.util.Map; import scala.Tuple2; @@ -66,10 +67,10 @@ public void testKafkaRDD() throws InterruptedException { String topic1 = "topic1"; String topic2 = 
"topic2"; - String[] topic1data = createTopicAndSendData(topic1); - String[] topic2data = createTopicAndSendData(topic2); + createTopicAndSendData(topic1); + createTopicAndSendData(topic2); - HashMap kafkaParams = new HashMap(); + Map kafkaParams = new HashMap<>(); kafkaParams.put("metadata.broker.list", kafkaTestUtils.brokerAddress()); OffsetRange[] offsetRanges = { @@ -77,8 +78,8 @@ public void testKafkaRDD() throws InterruptedException { OffsetRange.create(topic2, 0, 0, 1) }; - HashMap emptyLeaders = new HashMap(); - HashMap leaders = new HashMap(); + Map emptyLeaders = new HashMap<>(); + Map leaders = new HashMap<>(); String[] hostAndPort = kafkaTestUtils.brokerAddress().split(":"); Broker broker = Broker.create(hostAndPort[0], Integer.parseInt(hostAndPort[1])); leaders.put(new TopicAndPartition(topic1, 0), broker); @@ -95,7 +96,7 @@ public void testKafkaRDD() throws InterruptedException { ).map( new Function, String>() { @Override - public String call(Tuple2 kv) throws Exception { + public String call(Tuple2 kv) { return kv._2(); } } @@ -113,7 +114,7 @@ public String call(Tuple2 kv) throws Exception { emptyLeaders, new Function, String>() { @Override - public String call(MessageAndMetadata msgAndMd) throws Exception { + public String call(MessageAndMetadata msgAndMd) { return msgAndMd.message(); } } @@ -131,7 +132,7 @@ public String call(MessageAndMetadata msgAndMd) throws Exception leaders, new Function, String>() { @Override - public String call(MessageAndMetadata msgAndMd) throws Exception { + public String call(MessageAndMetadata msgAndMd) { return msgAndMd.message(); } } diff --git a/external/kafka/src/test/java/org/apache/spark/streaming/kafka/JavaKafkaStreamSuite.java b/external/kafka/src/test/java/org/apache/spark/streaming/kafka/JavaKafkaStreamSuite.java index e4c659215b767..1e69de46cd35d 100644 --- a/external/kafka/src/test/java/org/apache/spark/streaming/kafka/JavaKafkaStreamSuite.java +++ b/external/kafka/src/test/java/org/apache/spark/streaming/kafka/JavaKafkaStreamSuite.java @@ -67,10 +67,10 @@ public void tearDown() { @Test public void testKafkaStream() throws InterruptedException { String topic = "topic1"; - HashMap topics = new HashMap(); + Map topics = new HashMap<>(); topics.put(topic, 1); - HashMap sent = new HashMap(); + Map sent = new HashMap<>(); sent.put("a", 5); sent.put("b", 3); sent.put("c", 10); @@ -78,7 +78,7 @@ public void testKafkaStream() throws InterruptedException { kafkaTestUtils.createTopic(topic); kafkaTestUtils.sendMessages(topic, sent); - HashMap kafkaParams = new HashMap(); + Map kafkaParams = new HashMap<>(); kafkaParams.put("zookeeper.connect", kafkaTestUtils.zkAddress()); kafkaParams.put("group.id", "test-consumer-" + random.nextInt(10000)); kafkaParams.put("auto.offset.reset", "smallest"); @@ -97,7 +97,7 @@ public void testKafkaStream() throws InterruptedException { JavaDStream words = stream.map( new Function, String>() { @Override - public String call(Tuple2 tuple2) throws Exception { + public String call(Tuple2 tuple2) { return tuple2._2(); } } @@ -106,7 +106,7 @@ public String call(Tuple2 tuple2) throws Exception { words.countByValue().foreachRDD( new Function, Void>() { @Override - public Void call(JavaPairRDD rdd) throws Exception { + public Void call(JavaPairRDD rdd) { List> ret = rdd.collect(); for (Tuple2 r : ret) { if (result.containsKey(r._1())) { @@ -130,8 +130,8 @@ public Void call(JavaPairRDD rdd) throws Exception { Thread.sleep(200); } Assert.assertEquals(sent.size(), result.size()); - for (String k : sent.keySet()) { - 
Assert.assertEquals(sent.get(k).intValue(), result.get(k).intValue()); + for (Map.Entry e : sent.entrySet()) { + Assert.assertEquals(e.getValue().intValue(), result.get(e.getKey()).intValue()); } } } diff --git a/external/twitter/src/test/java/org/apache/spark/streaming/twitter/JavaTwitterStreamSuite.java b/external/twitter/src/test/java/org/apache/spark/streaming/twitter/JavaTwitterStreamSuite.java index e46b4e5c7531d..26ec8af455bcf 100644 --- a/external/twitter/src/test/java/org/apache/spark/streaming/twitter/JavaTwitterStreamSuite.java +++ b/external/twitter/src/test/java/org/apache/spark/streaming/twitter/JavaTwitterStreamSuite.java @@ -17,8 +17,6 @@ package org.apache.spark.streaming.twitter; -import java.util.Arrays; - import org.junit.Test; import twitter4j.Status; import twitter4j.auth.Authorization; @@ -30,7 +28,7 @@ public class JavaTwitterStreamSuite extends LocalJavaStreamingContext { @Test public void testTwitterStream() { - String[] filters = (String[])Arrays.asList("filter1", "filter2").toArray(); + String[] filters = { "filter1", "filter2" }; Authorization auth = NullAuthorization.getInstance(); // tests the API, does not actually test data receiving diff --git a/extras/java8-tests/src/test/java/org/apache/spark/Java8APISuite.java b/extras/java8-tests/src/test/java/org/apache/spark/Java8APISuite.java index 729bc0459ce52..14975265ab2ce 100644 --- a/extras/java8-tests/src/test/java/org/apache/spark/Java8APISuite.java +++ b/extras/java8-tests/src/test/java/org/apache/spark/Java8APISuite.java @@ -77,7 +77,7 @@ public void call(String s) { public void foreach() { foreachCalls = 0; JavaRDD rdd = sc.parallelize(Arrays.asList("Hello", "World")); - rdd.foreach((x) -> foreachCalls++); + rdd.foreach(x -> foreachCalls++); Assert.assertEquals(2, foreachCalls); } @@ -180,7 +180,7 @@ public void map() { JavaPairRDD pairs = rdd.mapToPair(x -> new Tuple2<>(x, x)) .cache(); pairs.collect(); - JavaRDD strings = rdd.map(x -> x.toString()).cache(); + JavaRDD strings = rdd.map(Object::toString).cache(); strings.collect(); } @@ -195,7 +195,9 @@ public void flatMap() { JavaPairRDD pairs = rdd.flatMapToPair(s -> { List> pairs2 = new LinkedList<>(); - for (String word : s.split(" ")) pairs2.add(new Tuple2<>(word, word)); + for (String word : s.split(" ")) { + pairs2.add(new Tuple2<>(word, word)); + } return pairs2; }); @@ -204,11 +206,12 @@ public void flatMap() { JavaDoubleRDD doubles = rdd.flatMapToDouble(s -> { List lengths = new LinkedList<>(); - for (String word : s.split(" ")) lengths.add(word.length() * 1.0); + for (String word : s.split(" ")) { + lengths.add((double) word.length()); + } return lengths; }); - Double x = doubles.first(); Assert.assertEquals(5.0, doubles.first(), 0.01); Assert.assertEquals(11, pairs.count()); } @@ -228,7 +231,7 @@ public void mapsFromPairsToPairs() { swapped.collect(); // There was never a bug here, but it's worth testing: - pairRDD.map(item -> item.swap()).collect(); + pairRDD.map(Tuple2::swap).collect(); } @Test @@ -282,11 +285,11 @@ public void zipPartitions() { FlatMapFunction2, Iterator, Integer> sizesFn = (Iterator i, Iterator s) -> { int sizeI = 0; - int sizeS = 0; while (i.hasNext()) { sizeI += 1; i.next(); } + int sizeS = 0; while (s.hasNext()) { sizeS += 1; s.next(); @@ -301,30 +304,31 @@ public void zipPartitions() { public void accumulators() { JavaRDD rdd = sc.parallelize(Arrays.asList(1, 2, 3, 4, 5)); - final Accumulator intAccum = sc.intAccumulator(10); - rdd.foreach(x -> intAccum.add(x)); + Accumulator intAccum = sc.intAccumulator(10); + 
rdd.foreach(intAccum::add); Assert.assertEquals((Integer) 25, intAccum.value()); - final Accumulator doubleAccum = sc.doubleAccumulator(10.0); + Accumulator doubleAccum = sc.doubleAccumulator(10.0); rdd.foreach(x -> doubleAccum.add((double) x)); Assert.assertEquals((Double) 25.0, doubleAccum.value()); // Try a custom accumulator type AccumulatorParam floatAccumulatorParam = new AccumulatorParam() { + @Override public Float addInPlace(Float r, Float t) { return r + t; } - + @Override public Float addAccumulator(Float r, Float t) { return r + t; } - + @Override public Float zero(Float initialValue) { return 0.0f; } }; - final Accumulator floatAccum = sc.accumulator(10.0f, floatAccumulatorParam); + Accumulator floatAccum = sc.accumulator(10.0f, floatAccumulatorParam); rdd.foreach(x -> floatAccum.add((float) x)); Assert.assertEquals((Float) 25.0f, floatAccum.value()); @@ -336,7 +340,7 @@ public Float zero(Float initialValue) { @Test public void keyBy() { JavaRDD rdd = sc.parallelize(Arrays.asList(1, 2)); - List> s = rdd.keyBy(x -> x.toString()).collect(); + List> s = rdd.keyBy(Object::toString).collect(); Assert.assertEquals(new Tuple2<>("1", 1), s.get(0)); Assert.assertEquals(new Tuple2<>("2", 2), s.get(1)); } @@ -349,7 +353,7 @@ public void mapOnPairRDD() { JavaPairRDD rdd3 = rdd2.mapToPair(in -> new Tuple2<>(in._2(), in._1())); Assert.assertEquals(Arrays.asList( - new Tuple2(1, 1), + new Tuple2<>(1, 1), new Tuple2<>(0, 2), new Tuple2<>(1, 3), new Tuple2<>(0, 4)), rdd3.collect()); @@ -361,7 +365,7 @@ public void collectPartitions() { JavaPairRDD rdd2 = rdd1.mapToPair(i -> new Tuple2<>(i, i % 2)); - List[] parts = rdd1.collectPartitions(new int[]{0}); + List[] parts = rdd1.collectPartitions(new int[]{0}); Assert.assertEquals(Arrays.asList(1, 2), parts[0]); parts = rdd1.collectPartitions(new int[]{1, 2}); @@ -371,19 +375,19 @@ public void collectPartitions() { Assert.assertEquals(Arrays.asList(new Tuple2<>(1, 1), new Tuple2<>(2, 0)), rdd2.collectPartitions(new int[]{0})[0]); - parts = rdd2.collectPartitions(new int[]{1, 2}); - Assert.assertEquals(Arrays.asList(new Tuple2<>(3, 1), new Tuple2<>(4, 0)), parts[0]); + List>[] parts2 = rdd2.collectPartitions(new int[]{1, 2}); + Assert.assertEquals(Arrays.asList(new Tuple2<>(3, 1), new Tuple2<>(4, 0)), parts2[0]); Assert.assertEquals(Arrays.asList(new Tuple2<>(5, 1), new Tuple2<>(6, 0), new Tuple2<>(7, 1)), - parts[1]); + parts2[1]); } @Test public void collectAsMapWithIntArrayValues() { // Regression test for SPARK-1040 - JavaRDD rdd = sc.parallelize(Arrays.asList(new Integer[]{1})); + JavaRDD rdd = sc.parallelize(Arrays.asList(1)); JavaPairRDD pairRDD = rdd.mapToPair(x -> new Tuple2<>(x, new int[]{x})); pairRDD.collect(); // Works fine - Map map = pairRDD.collectAsMap(); // Used to crash with ClassCastException + pairRDD.collectAsMap(); // Used to crash with ClassCastException } } diff --git a/sql/core/src/test/java/test/org/apache/spark/sql/JavaApplySchemaSuite.java b/sql/core/src/test/java/test/org/apache/spark/sql/JavaApplySchemaSuite.java index bf693c7c393f6..7b50aad4ad498 100644 --- a/sql/core/src/test/java/test/org/apache/spark/sql/JavaApplySchemaSuite.java +++ b/sql/core/src/test/java/test/org/apache/spark/sql/JavaApplySchemaSuite.java @@ -18,6 +18,7 @@ package test.org.apache.spark.sql; import java.io.Serializable; +import java.math.BigDecimal; import java.util.ArrayList; import java.util.Arrays; import java.util.List; @@ -83,7 +84,7 @@ public void setAge(int age) { @Test public void applySchema() { - List personList = new ArrayList(2); + 
List personList = new ArrayList<>(2); Person person1 = new Person(); person1.setName("Michael"); person1.setAge(29); @@ -95,12 +96,13 @@ public void applySchema() { JavaRDD rowRDD = javaCtx.parallelize(personList).map( new Function() { + @Override public Row call(Person person) throws Exception { return RowFactory.create(person.getName(), person.getAge()); } }); - List fields = new ArrayList(2); + List fields = new ArrayList<>(2); fields.add(DataTypes.createStructField("name", DataTypes.StringType, false)); fields.add(DataTypes.createStructField("age", DataTypes.IntegerType, false)); StructType schema = DataTypes.createStructType(fields); @@ -118,7 +120,7 @@ public Row call(Person person) throws Exception { @Test public void dataFrameRDDOperations() { - List personList = new ArrayList(2); + List personList = new ArrayList<>(2); Person person1 = new Person(); person1.setName("Michael"); person1.setAge(29); @@ -129,27 +131,28 @@ public void dataFrameRDDOperations() { personList.add(person2); JavaRDD rowRDD = javaCtx.parallelize(personList).map( - new Function() { - public Row call(Person person) throws Exception { - return RowFactory.create(person.getName(), person.getAge()); - } - }); - - List fields = new ArrayList(2); - fields.add(DataTypes.createStructField("name", DataTypes.StringType, false)); + new Function() { + @Override + public Row call(Person person) { + return RowFactory.create(person.getName(), person.getAge()); + } + }); + + List fields = new ArrayList<>(2); + fields.add(DataTypes.createStructField("", DataTypes.StringType, false)); fields.add(DataTypes.createStructField("age", DataTypes.IntegerType, false)); StructType schema = DataTypes.createStructType(fields); DataFrame df = sqlContext.applySchema(rowRDD, schema); df.registerTempTable("people"); List actual = sqlContext.sql("SELECT * FROM people").toJavaRDD().map(new Function() { - + @Override public String call(Row row) { - return row.getString(0) + "_" + row.get(1).toString(); + return row.getString(0) + "_" + row.get(1); } }).collect(); - List expected = new ArrayList(2); + List expected = new ArrayList<>(2); expected.add("Michael_29"); expected.add("Yin_28"); @@ -165,7 +168,7 @@ public void applySchemaToJSON() { "{\"string\":\"this is another simple string.\", \"integer\":11, \"long\":21474836469, " + "\"bigInteger\":92233720368547758069, \"double\":1.7976931348623157E305, " + "\"boolean\":false, \"null\":null}")); - List fields = new ArrayList(7); + List fields = new ArrayList<>(7); fields.add(DataTypes.createStructField("bigInteger", DataTypes.createDecimalType(20, 0), true)); fields.add(DataTypes.createStructField("boolean", DataTypes.BooleanType, true)); @@ -175,10 +178,10 @@ public void applySchemaToJSON() { fields.add(DataTypes.createStructField("null", DataTypes.StringType, true)); fields.add(DataTypes.createStructField("string", DataTypes.StringType, true)); StructType expectedSchema = DataTypes.createStructType(fields); - List expectedResult = new ArrayList(2); + List expectedResult = new ArrayList<>(2); expectedResult.add( RowFactory.create( - new java.math.BigDecimal("92233720368547758070"), + new BigDecimal("92233720368547758070"), true, 1.7976931348623157E308, 10, @@ -187,7 +190,7 @@ public void applySchemaToJSON() { "this is a simple string.")); expectedResult.add( RowFactory.create( - new java.math.BigDecimal("92233720368547758069"), + new BigDecimal("92233720368547758069"), false, 1.7976931348623157E305, 11, diff --git a/sql/core/src/test/java/test/org/apache/spark/sql/JavaDataFrameSuite.java 
b/sql/core/src/test/java/test/org/apache/spark/sql/JavaDataFrameSuite.java index 4867cebf5328c..d981ce947f435 100644 --- a/sql/core/src/test/java/test/org/apache/spark/sql/JavaDataFrameSuite.java +++ b/sql/core/src/test/java/test/org/apache/spark/sql/JavaDataFrameSuite.java @@ -61,7 +61,7 @@ public void tearDown() { @Test public void testExecution() { DataFrame df = context.table("testData").filter("key = 1"); - Assert.assertEquals(df.select("key").collect()[0].get(0), 1); + Assert.assertEquals(1, df.select("key").collect()[0].get(0)); } /** @@ -119,7 +119,7 @@ public void testShow() { public static class Bean implements Serializable { private double a = 0.0; - private Integer[] b = new Integer[]{0, 1}; + private Integer[] b = { 0, 1 }; private Map c = ImmutableMap.of("hello", new int[] { 1, 2 }); private List d = Arrays.asList("floppy", "disk"); @@ -161,7 +161,7 @@ public void testCreateDataFrameFromJavaBeans() { schema.apply("d")); Row first = df.select("a", "b", "c", "d").first(); Assert.assertEquals(bean.getA(), first.getDouble(0), 0.0); - // Now Java lists and maps are converetd to Scala Seq's and Map's. Once we get a Seq below, + // Now Java lists and maps are converted to Scala Seq's and Map's. Once we get a Seq below, // verify that it has the expected length, and contains expected elements. Seq result = first.getAs(1); Assert.assertEquals(bean.getB().length, result.length()); @@ -180,7 +180,8 @@ public void testCreateDataFrameFromJavaBeans() { } } - private static Comparator CrosstabRowComparator = new Comparator() { + private static final Comparator crosstabRowComparator = new Comparator() { + @Override public int compare(Row row1, Row row2) { String item1 = row1.getString(0); String item2 = row2.getString(0); @@ -193,16 +194,16 @@ public void testCrosstab() { DataFrame df = context.table("testData2"); DataFrame crosstab = df.stat().crosstab("a", "b"); String[] columnNames = crosstab.schema().fieldNames(); - Assert.assertEquals(columnNames[0], "a_b"); - Assert.assertEquals(columnNames[1], "1"); - Assert.assertEquals(columnNames[2], "2"); + Assert.assertEquals("a_b", columnNames[0]); + Assert.assertEquals("1", columnNames[1]); + Assert.assertEquals("2", columnNames[2]); Row[] rows = crosstab.collect(); - Arrays.sort(rows, CrosstabRowComparator); + Arrays.sort(rows, crosstabRowComparator); Integer count = 1; for (Row row : rows) { Assert.assertEquals(row.get(0).toString(), count.toString()); - Assert.assertEquals(row.getLong(1), 1L); - Assert.assertEquals(row.getLong(2), 1L); + Assert.assertEquals(1L, row.getLong(1)); + Assert.assertEquals(1L, row.getLong(2)); count++; } } @@ -210,7 +211,7 @@ public void testCrosstab() { @Test public void testFrequentItems() { DataFrame df = context.table("testData2"); - String[] cols = new String[]{"a"}; + String[] cols = {"a"}; DataFrame results = df.stat().freqItems(cols, 0.2); Assert.assertTrue(results.collect()[0].getSeq(0).contains(1)); } @@ -219,14 +220,14 @@ public void testFrequentItems() { public void testCorrelation() { DataFrame df = context.table("testData2"); Double pearsonCorr = df.stat().corr("a", "b", "pearson"); - Assert.assertTrue(Math.abs(pearsonCorr) < 1e-6); + Assert.assertTrue(Math.abs(pearsonCorr) < 1.0e-6); } @Test public void testCovariance() { DataFrame df = context.table("testData2"); Double result = df.stat().cov("a", "b"); - Assert.assertTrue(Math.abs(result) < 1e-6); + Assert.assertTrue(Math.abs(result) < 1.0e-6); } @Test @@ -234,7 +235,7 @@ public void testSampleBy() { DataFrame df = context.range(0, 100, 1, 
2).select(col("id").mod(3).as("key")); DataFrame sampled = df.stat().sampleBy("key", ImmutableMap.of(0, 0.1, 1, 0.2), 0L); Row[] actual = sampled.groupBy("key").count().orderBy("key").collect(); - Row[] expected = new Row[] {RowFactory.create(0, 5), RowFactory.create(1, 8)}; + Row[] expected = {RowFactory.create(0, 5), RowFactory.create(1, 8)}; Assert.assertArrayEquals(expected, actual); } } diff --git a/sql/core/src/test/java/test/org/apache/spark/sql/JavaRowSuite.java b/sql/core/src/test/java/test/org/apache/spark/sql/JavaRowSuite.java index 4ce1d1dddb26a..3ab4db2a035d3 100644 --- a/sql/core/src/test/java/test/org/apache/spark/sql/JavaRowSuite.java +++ b/sql/core/src/test/java/test/org/apache/spark/sql/JavaRowSuite.java @@ -18,6 +18,7 @@ package test.org.apache.spark.sql; import java.math.BigDecimal; +import java.nio.charset.StandardCharsets; import java.sql.Date; import java.sql.Timestamp; import java.util.Arrays; @@ -52,12 +53,12 @@ public void setUp() { shortValue = (short)32767; intValue = 2147483647; longValue = 9223372036854775807L; - floatValue = (float)3.4028235E38; + floatValue = 3.4028235E38f; doubleValue = 1.7976931348623157E308; decimalValue = new BigDecimal("1.7976931348623157E328"); booleanValue = true; stringValue = "this is a string"; - binaryValue = stringValue.getBytes(); + binaryValue = stringValue.getBytes(StandardCharsets.UTF_8); dateValue = Date.valueOf("2014-06-30"); timestampValue = Timestamp.valueOf("2014-06-30 09:20:00.0"); } @@ -123,8 +124,8 @@ public void constructSimpleRow() { Assert.assertEquals(binaryValue, simpleRow.get(16)); Assert.assertEquals(dateValue, simpleRow.get(17)); Assert.assertEquals(timestampValue, simpleRow.get(18)); - Assert.assertEquals(true, simpleRow.isNullAt(19)); - Assert.assertEquals(null, simpleRow.get(19)); + Assert.assertTrue(simpleRow.isNullAt(19)); + Assert.assertNull(simpleRow.get(19)); } @Test @@ -134,7 +135,7 @@ public void constructComplexRow() { stringValue + " (1)", stringValue + " (2)", stringValue + "(3)"); // Simple map - Map simpleMap = new HashMap(); + Map simpleMap = new HashMap<>(); simpleMap.put(stringValue + " (1)", longValue); simpleMap.put(stringValue + " (2)", longValue - 1); simpleMap.put(stringValue + " (3)", longValue - 2); @@ -149,7 +150,7 @@ public void constructComplexRow() { List arrayOfRows = Arrays.asList(simpleStruct); // Complex map - Map, Row> complexMap = new HashMap, Row>(); + Map, Row> complexMap = new HashMap<>(); complexMap.put(arrayOfRows, simpleStruct); // Complex struct @@ -167,7 +168,7 @@ public void constructComplexRow() { Assert.assertEquals(arrayOfMaps, complexStruct.get(3)); Assert.assertEquals(arrayOfRows, complexStruct.get(4)); Assert.assertEquals(complexMap, complexStruct.get(5)); - Assert.assertEquals(null, complexStruct.get(6)); + Assert.assertNull(complexStruct.get(6)); // A very complex row Row complexRow = RowFactory.create(arrayOfMaps, arrayOfRows, complexMap, complexStruct); diff --git a/sql/core/src/test/java/test/org/apache/spark/sql/JavaUDFSuite.java b/sql/core/src/test/java/test/org/apache/spark/sql/JavaUDFSuite.java index bb02b58cca9be..4a78dca7fea66 100644 --- a/sql/core/src/test/java/test/org/apache/spark/sql/JavaUDFSuite.java +++ b/sql/core/src/test/java/test/org/apache/spark/sql/JavaUDFSuite.java @@ -20,6 +20,7 @@ import java.io.Serializable; import org.junit.After; +import org.junit.Assert; import org.junit.Before; import org.junit.Test; @@ -61,13 +62,13 @@ public void udf1Test() { sqlContext.udf().register("stringLengthTest", new UDF1() { @Override - public Integer 
call(String str) throws Exception { + public Integer call(String str) { return str.length(); } }, DataTypes.IntegerType); Row result = sqlContext.sql("SELECT stringLengthTest('test')").head(); - assert(result.getInt(0) == 4); + Assert.assertEquals(4, result.getInt(0)); } @SuppressWarnings("unchecked") @@ -81,12 +82,12 @@ public void udf2Test() { sqlContext.udf().register("stringLengthTest", new UDF2() { @Override - public Integer call(String str1, String str2) throws Exception { + public Integer call(String str1, String str2) { return str1.length() + str2.length(); } }, DataTypes.IntegerType); Row result = sqlContext.sql("SELECT stringLengthTest('test', 'test2')").head(); - assert(result.getInt(0) == 9); + Assert.assertEquals(9, result.getInt(0)); } } diff --git a/sql/core/src/test/java/test/org/apache/spark/sql/sources/JavaSaveLoadSuite.java b/sql/core/src/test/java/test/org/apache/spark/sql/sources/JavaSaveLoadSuite.java index 6f9e7f68dc39c..9e241f20987c0 100644 --- a/sql/core/src/test/java/test/org/apache/spark/sql/sources/JavaSaveLoadSuite.java +++ b/sql/core/src/test/java/test/org/apache/spark/sql/sources/JavaSaveLoadSuite.java @@ -44,7 +44,7 @@ public class JavaSaveLoadSuite { File path; DataFrame df; - private void checkAnswer(DataFrame actual, List expected) { + private static void checkAnswer(DataFrame actual, List expected) { String errorMessage = QueryTest$.MODULE$.checkAnswer(actual, expected); if (errorMessage != null) { Assert.fail(errorMessage); @@ -64,7 +64,7 @@ public void setUp() throws IOException { path.delete(); } - List jsonObjects = new ArrayList(10); + List jsonObjects = new ArrayList<>(10); for (int i = 0; i < 10; i++) { jsonObjects.add("{\"a\":" + i + ", \"b\":\"str" + i + "\"}"); } @@ -82,7 +82,7 @@ public void tearDown() { @Test public void saveAndLoad() { - Map options = new HashMap(); + Map options = new HashMap<>(); options.put("path", path.toString()); df.write().mode(SaveMode.ErrorIfExists).format("json").options(options).save(); DataFrame loadedDF = sqlContext.read().format("json").options(options).load(); @@ -91,11 +91,11 @@ public void saveAndLoad() { @Test public void saveAndLoadWithSchema() { - Map options = new HashMap(); + Map options = new HashMap<>(); options.put("path", path.toString()); df.write().format("json").mode(SaveMode.ErrorIfExists).options(options).save(); - List fields = new ArrayList(); + List fields = new ArrayList<>(); fields.add(DataTypes.createStructField("b", DataTypes.StringType, true)); StructType schema = DataTypes.createStructType(fields); DataFrame loadedDF = sqlContext.read().format("json").schema(schema).options(options).load(); diff --git a/sql/hive/src/test/java/org/apache/spark/sql/hive/JavaDataFrameSuite.java b/sql/hive/src/test/java/org/apache/spark/sql/hive/JavaDataFrameSuite.java index 019d8a30266e2..b4bf9eef8fca5 100644 --- a/sql/hive/src/test/java/org/apache/spark/sql/hive/JavaDataFrameSuite.java +++ b/sql/hive/src/test/java/org/apache/spark/sql/hive/JavaDataFrameSuite.java @@ -40,7 +40,7 @@ public class JavaDataFrameSuite { DataFrame df; - private void checkAnswer(DataFrame actual, List expected) { + private static void checkAnswer(DataFrame actual, List expected) { String errorMessage = QueryTest$.MODULE$.checkAnswer(actual, expected); if (errorMessage != null) { Assert.fail(errorMessage); @@ -52,7 +52,7 @@ public void setUp() throws IOException { hc = TestHive$.MODULE$; sc = new JavaSparkContext(hc.sparkContext()); - List jsonObjects = new ArrayList(10); + List jsonObjects = new ArrayList<>(10); for (int i = 0; 
i < 10; i++) { jsonObjects.add("{\"key\":" + i + ", \"value\":\"str" + i + "\"}"); } @@ -71,7 +71,7 @@ public void tearDown() throws IOException { @Test public void saveTableAndQueryIt() { checkAnswer( - df.select(functions.avg("key").over( + df.select(avg("key").over( Window.partitionBy("value").orderBy("key").rowsBetween(-1, 1))), hc.sql("SELECT avg(key) " + "OVER (PARTITION BY value " + @@ -95,7 +95,7 @@ public void testUDAF() { registeredUDAF.apply(col("value")), callUDF("mydoublesum", col("value"))); - List expectedResult = new ArrayList(); + List expectedResult = new ArrayList<>(); expectedResult.add(RowFactory.create(4950.0, 9900.0, 9900.0, 9900.0)); checkAnswer( aggregatedDF, diff --git a/sql/hive/src/test/java/org/apache/spark/sql/hive/JavaMetastoreDataSourcesSuite.java b/sql/hive/src/test/java/org/apache/spark/sql/hive/JavaMetastoreDataSourcesSuite.java index 4192155975c47..c8d272794d10b 100644 --- a/sql/hive/src/test/java/org/apache/spark/sql/hive/JavaMetastoreDataSourcesSuite.java +++ b/sql/hive/src/test/java/org/apache/spark/sql/hive/JavaMetastoreDataSourcesSuite.java @@ -53,7 +53,7 @@ public class JavaMetastoreDataSourcesSuite { FileSystem fs; DataFrame df; - private void checkAnswer(DataFrame actual, List expected) { + private static void checkAnswer(DataFrame actual, List expected) { String errorMessage = QueryTest$.MODULE$.checkAnswer(actual, expected); if (errorMessage != null) { Assert.fail(errorMessage); @@ -77,7 +77,7 @@ public void setUp() throws IOException { fs.delete(hiveManagedPath, true); } - List jsonObjects = new ArrayList(10); + List jsonObjects = new ArrayList<>(10); for (int i = 0; i < 10; i++) { jsonObjects.add("{\"a\":" + i + ", \"b\":\"str" + i + "\"}"); } @@ -97,7 +97,7 @@ public void tearDown() throws IOException { @Test public void saveExternalTableAndQueryIt() { - Map options = new HashMap(); + Map options = new HashMap<>(); options.put("path", path.toString()); df.write() .format("org.apache.spark.sql.json") @@ -120,7 +120,7 @@ public void saveExternalTableAndQueryIt() { @Test public void saveExternalTableWithSchemaAndQueryIt() { - Map options = new HashMap(); + Map options = new HashMap<>(); options.put("path", path.toString()); df.write() .format("org.apache.spark.sql.json") @@ -132,7 +132,7 @@ public void saveExternalTableWithSchemaAndQueryIt() { sqlContext.sql("SELECT * FROM javaSavedTable"), df.collectAsList()); - List fields = new ArrayList(); + List fields = new ArrayList<>(); fields.add(DataTypes.createStructField("b", DataTypes.StringType, true)); StructType schema = DataTypes.createStructType(fields); DataFrame loadedDF = @@ -148,7 +148,7 @@ public void saveExternalTableWithSchemaAndQueryIt() { @Test public void saveTableAndQueryIt() { - Map options = new HashMap(); + Map options = new HashMap<>(); df.write() .format("org.apache.spark.sql.json") .mode(SaveMode.Append) diff --git a/streaming/src/test/java/org/apache/spark/streaming/JavaAPISuite.java b/streaming/src/test/java/org/apache/spark/streaming/JavaAPISuite.java index e0718f73aa13f..c5217149224e4 100644 --- a/streaming/src/test/java/org/apache/spark/streaming/JavaAPISuite.java +++ b/streaming/src/test/java/org/apache/spark/streaming/JavaAPISuite.java @@ -18,24 +18,22 @@ package org.apache.spark.streaming; import java.io.*; -import java.lang.Iterable; import java.nio.charset.Charset; import java.util.*; import java.util.concurrent.atomic.AtomicBoolean; +import scala.Tuple2; + +import org.apache.hadoop.conf.Configuration; import org.apache.hadoop.fs.Path; import 
org.apache.hadoop.io.LongWritable; import org.apache.hadoop.io.Text; import org.apache.hadoop.mapreduce.lib.input.TextInputFormat; -import scala.Tuple2; - import org.junit.Assert; -import static org.junit.Assert.*; import org.junit.Test; import com.google.common.base.Optional; -import com.google.common.collect.Lists; import com.google.common.io.Files; import com.google.common.collect.Sets; @@ -54,14 +52,14 @@ // see http://stackoverflow.com/questions/758570/. public class JavaAPISuite extends LocalJavaStreamingContext implements Serializable { - public void equalIterator(Iterator a, Iterator b) { + public static void equalIterator(Iterator a, Iterator b) { while (a.hasNext() && b.hasNext()) { Assert.assertEquals(a.next(), b.next()); } Assert.assertEquals(a.hasNext(), b.hasNext()); } - public void equalIterable(Iterable a, Iterable b) { + public static void equalIterable(Iterable a, Iterable b) { equalIterator(a.iterator(), b.iterator()); } @@ -74,14 +72,14 @@ public void testInitialization() { @Test public void testContextState() { List> inputData = Arrays.asList(Arrays.asList(1, 2, 3, 4)); - Assert.assertTrue(ssc.getState() == StreamingContextState.INITIALIZED); + Assert.assertEquals(StreamingContextState.INITIALIZED, ssc.getState()); JavaDStream stream = JavaTestUtils.attachTestInputStream(ssc, inputData, 1); JavaTestUtils.attachTestOutputStream(stream); - Assert.assertTrue(ssc.getState() == StreamingContextState.INITIALIZED); + Assert.assertEquals(StreamingContextState.INITIALIZED, ssc.getState()); ssc.start(); - Assert.assertTrue(ssc.getState() == StreamingContextState.ACTIVE); + Assert.assertEquals(StreamingContextState.ACTIVE, ssc.getState()); ssc.stop(); - Assert.assertTrue(ssc.getState() == StreamingContextState.STOPPED); + Assert.assertEquals(StreamingContextState.STOPPED, ssc.getState()); } @SuppressWarnings("unchecked") @@ -118,7 +116,7 @@ public void testMap() { JavaDStream stream = JavaTestUtils.attachTestInputStream(ssc, inputData, 1); JavaDStream letterCount = stream.map(new Function() { @Override - public Integer call(String s) throws Exception { + public Integer call(String s) { return s.length(); } }); @@ -180,7 +178,7 @@ public void testWindowWithSlideDuration() { public void testFilter() { List> inputData = Arrays.asList( Arrays.asList("giants", "dodgers"), - Arrays.asList("yankees", "red socks")); + Arrays.asList("yankees", "red sox")); List> expected = Arrays.asList( Arrays.asList("giants"), @@ -189,7 +187,7 @@ public void testFilter() { JavaDStream stream = JavaTestUtils.attachTestInputStream(ssc, inputData, 1); JavaDStream filtered = stream.filter(new Function() { @Override - public Boolean call(String s) throws Exception { + public Boolean call(String s) { return s.contains("a"); } }); @@ -243,11 +241,11 @@ public void testRepartitionFewerPartitions() { public void testGlom() { List> inputData = Arrays.asList( Arrays.asList("giants", "dodgers"), - Arrays.asList("yankees", "red socks")); + Arrays.asList("yankees", "red sox")); List>> expected = Arrays.asList( Arrays.asList(Arrays.asList("giants", "dodgers")), - Arrays.asList(Arrays.asList("yankees", "red socks"))); + Arrays.asList(Arrays.asList("yankees", "red sox"))); JavaDStream stream = JavaTestUtils.attachTestInputStream(ssc, inputData, 1); JavaDStream> glommed = stream.glom(); @@ -262,22 +260,22 @@ public void testGlom() { public void testMapPartitions() { List> inputData = Arrays.asList( Arrays.asList("giants", "dodgers"), - Arrays.asList("yankees", "red socks")); + Arrays.asList("yankees", "red sox")); List> 
expected = Arrays.asList( Arrays.asList("GIANTSDODGERS"), - Arrays.asList("YANKEESRED SOCKS")); + Arrays.asList("YANKEESRED SOX")); JavaDStream stream = JavaTestUtils.attachTestInputStream(ssc, inputData, 1); JavaDStream mapped = stream.mapPartitions( new FlatMapFunction, String>() { @Override public Iterable call(Iterator in) { - String out = ""; + StringBuilder out = new StringBuilder(); while (in.hasNext()) { - out = out + in.next().toUpperCase(); + out.append(in.next().toUpperCase(Locale.ENGLISH)); } - return Lists.newArrayList(out); + return Arrays.asList(out.toString()); } }); JavaTestUtils.attachTestOutputStream(mapped); @@ -286,16 +284,16 @@ public Iterable call(Iterator in) { Assert.assertEquals(expected, result); } - private class IntegerSum implements Function2 { + private static class IntegerSum implements Function2 { @Override - public Integer call(Integer i1, Integer i2) throws Exception { + public Integer call(Integer i1, Integer i2) { return i1 + i2; } } - private class IntegerDifference implements Function2 { + private static class IntegerDifference implements Function2 { @Override - public Integer call(Integer i1, Integer i2) throws Exception { + public Integer call(Integer i1, Integer i2) { return i1 - i2; } } @@ -347,13 +345,13 @@ private void testReduceByWindow(boolean withInverse) { Arrays.asList(24)); JavaDStream stream = JavaTestUtils.attachTestInputStream(ssc, inputData, 1); - JavaDStream reducedWindowed = null; + JavaDStream reducedWindowed; if (withInverse) { reducedWindowed = stream.reduceByWindow(new IntegerSum(), - new IntegerDifference(), new Duration(2000), new Duration(1000)); + new IntegerDifference(), new Duration(2000), new Duration(1000)); } else { reducedWindowed = stream.reduceByWindow(new IntegerSum(), - new Duration(2000), new Duration(1000)); + new Duration(2000), new Duration(1000)); } JavaTestUtils.attachTestOutputStream(reducedWindowed); List> result = JavaTestUtils.runStreams(ssc, 4, 4); @@ -378,11 +376,11 @@ public void testQueueStream() { Arrays.asList(7,8,9)); JavaSparkContext jsc = new JavaSparkContext(ssc.ssc().sc()); - JavaRDD rdd1 = ssc.sparkContext().parallelize(Arrays.asList(1, 2, 3)); - JavaRDD rdd2 = ssc.sparkContext().parallelize(Arrays.asList(4, 5, 6)); - JavaRDD rdd3 = ssc.sparkContext().parallelize(Arrays.asList(7,8,9)); + JavaRDD rdd1 = jsc.parallelize(Arrays.asList(1, 2, 3)); + JavaRDD rdd2 = jsc.parallelize(Arrays.asList(4, 5, 6)); + JavaRDD rdd3 = jsc.parallelize(Arrays.asList(7,8,9)); - LinkedList> rdds = Lists.newLinkedList(); + Queue> rdds = new LinkedList<>(); rdds.add(rdd1); rdds.add(rdd2); rdds.add(rdd3); @@ -410,10 +408,10 @@ public void testTransform() { JavaDStream transformed = stream.transform( new Function, JavaRDD>() { @Override - public JavaRDD call(JavaRDD in) throws Exception { + public JavaRDD call(JavaRDD in) { return in.map(new Function() { @Override - public Integer call(Integer i) throws Exception { + public Integer call(Integer i) { return i + 2; } }); @@ -435,70 +433,70 @@ public void testVariousTransform() { JavaDStream stream = JavaTestUtils.attachTestInputStream(ssc, inputData, 1); List>> pairInputData = - Arrays.asList(Arrays.asList(new Tuple2("x", 1))); + Arrays.asList(Arrays.asList(new Tuple2<>("x", 1))); JavaPairDStream pairStream = JavaPairDStream.fromJavaDStream( JavaTestUtils.attachTestInputStream(ssc, pairInputData, 1)); - JavaDStream transformed1 = stream.transform( + stream.transform( new Function, JavaRDD>() { @Override - public JavaRDD call(JavaRDD in) throws Exception { + public JavaRDD 
call(JavaRDD in) { return null; } } ); - JavaDStream transformed2 = stream.transform( + stream.transform( new Function2, Time, JavaRDD>() { - @Override public JavaRDD call(JavaRDD in, Time time) throws Exception { + @Override public JavaRDD call(JavaRDD in, Time time) { return null; } } ); - JavaPairDStream transformed3 = stream.transformToPair( + stream.transformToPair( new Function, JavaPairRDD>() { - @Override public JavaPairRDD call(JavaRDD in) throws Exception { + @Override public JavaPairRDD call(JavaRDD in) { return null; } } ); - JavaPairDStream transformed4 = stream.transformToPair( + stream.transformToPair( new Function2, Time, JavaPairRDD>() { - @Override public JavaPairRDD call(JavaRDD in, Time time) throws Exception { + @Override public JavaPairRDD call(JavaRDD in, Time time) { return null; } } ); - JavaDStream pairTransformed1 = pairStream.transform( + pairStream.transform( new Function, JavaRDD>() { - @Override public JavaRDD call(JavaPairRDD in) throws Exception { + @Override public JavaRDD call(JavaPairRDD in) { return null; } } ); - JavaDStream pairTransformed2 = pairStream.transform( + pairStream.transform( new Function2, Time, JavaRDD>() { - @Override public JavaRDD call(JavaPairRDD in, Time time) throws Exception { + @Override public JavaRDD call(JavaPairRDD in, Time time) { return null; } } ); - JavaPairDStream pairTransformed3 = pairStream.transformToPair( + pairStream.transformToPair( new Function, JavaPairRDD>() { - @Override public JavaPairRDD call(JavaPairRDD in) throws Exception { + @Override public JavaPairRDD call(JavaPairRDD in) { return null; } } ); - JavaPairDStream pairTransformed4 = pairStream.transformToPair( + pairStream.transformToPair( new Function2, Time, JavaPairRDD>() { - @Override public JavaPairRDD call(JavaPairRDD in, Time time) throws Exception { + @Override public JavaPairRDD call(JavaPairRDD in, Time time) { return null; } } @@ -511,32 +509,32 @@ public JavaRDD call(JavaRDD in) throws Exception { public void testTransformWith() { List>> stringStringKVStream1 = Arrays.asList( Arrays.asList( - new Tuple2("california", "dodgers"), - new Tuple2("new york", "yankees")), + new Tuple2<>("california", "dodgers"), + new Tuple2<>("new york", "yankees")), Arrays.asList( - new Tuple2("california", "sharks"), - new Tuple2("new york", "rangers"))); + new Tuple2<>("california", "sharks"), + new Tuple2<>("new york", "rangers"))); List>> stringStringKVStream2 = Arrays.asList( Arrays.asList( - new Tuple2("california", "giants"), - new Tuple2("new york", "mets")), + new Tuple2<>("california", "giants"), + new Tuple2<>("new york", "mets")), Arrays.asList( - new Tuple2("california", "ducks"), - new Tuple2("new york", "islanders"))); + new Tuple2<>("california", "ducks"), + new Tuple2<>("new york", "islanders"))); List>>> expected = Arrays.asList( Sets.newHashSet( - new Tuple2>("california", - new Tuple2("dodgers", "giants")), - new Tuple2>("new york", - new Tuple2("yankees", "mets"))), + new Tuple2<>("california", + new Tuple2<>("dodgers", "giants")), + new Tuple2<>("new york", + new Tuple2<>("yankees", "mets"))), Sets.newHashSet( - new Tuple2>("california", - new Tuple2("sharks", "ducks")), - new Tuple2>("new york", - new Tuple2("rangers", "islanders")))); + new Tuple2<>("california", + new Tuple2<>("sharks", "ducks")), + new Tuple2<>("new york", + new Tuple2<>("rangers", "islanders")))); JavaDStream> stream1 = JavaTestUtils.attachTestInputStream( ssc, stringStringKVStream1, 1); @@ -552,14 +550,12 @@ public void testTransformWith() { JavaPairRDD, JavaPairRDD, 
Time, - JavaPairRDD> - >() { + JavaPairRDD>>() { @Override public JavaPairRDD> call( JavaPairRDD rdd1, JavaPairRDD rdd2, - Time time - ) throws Exception { + Time time) { return rdd1.join(rdd2); } } @@ -567,9 +563,9 @@ public JavaPairRDD> call( JavaTestUtils.attachTestOutputStream(joined); List>>> result = JavaTestUtils.runStreams(ssc, 2, 2); - List>>> unorderedResult = Lists.newArrayList(); + List>>> unorderedResult = new ArrayList<>(); for (List>> res: result) { - unorderedResult.add(Sets.newHashSet(res)); + unorderedResult.add(Sets.newHashSet(res)); } Assert.assertEquals(expected, unorderedResult); @@ -587,89 +583,89 @@ public void testVariousTransformWith() { JavaDStream stream2 = JavaTestUtils.attachTestInputStream(ssc, inputData2, 1); List>> pairInputData1 = - Arrays.asList(Arrays.asList(new Tuple2("x", 1))); + Arrays.asList(Arrays.asList(new Tuple2<>("x", 1))); List>> pairInputData2 = - Arrays.asList(Arrays.asList(new Tuple2(1.0, 'x'))); + Arrays.asList(Arrays.asList(new Tuple2<>(1.0, 'x'))); JavaPairDStream pairStream1 = JavaPairDStream.fromJavaDStream( JavaTestUtils.attachTestInputStream(ssc, pairInputData1, 1)); JavaPairDStream pairStream2 = JavaPairDStream.fromJavaDStream( JavaTestUtils.attachTestInputStream(ssc, pairInputData2, 1)); - JavaDStream transformed1 = stream1.transformWith( + stream1.transformWith( stream2, new Function3, JavaRDD, Time, JavaRDD>() { @Override - public JavaRDD call(JavaRDD rdd1, JavaRDD rdd2, Time time) throws Exception { + public JavaRDD call(JavaRDD rdd1, JavaRDD rdd2, Time time) { return null; } } ); - JavaDStream transformed2 = stream1.transformWith( + stream1.transformWith( pairStream1, new Function3, JavaPairRDD, Time, JavaRDD>() { @Override - public JavaRDD call(JavaRDD rdd1, JavaPairRDD rdd2, Time time) throws Exception { + public JavaRDD call(JavaRDD rdd1, JavaPairRDD rdd2, Time time) { return null; } } ); - JavaPairDStream transformed3 = stream1.transformWithToPair( + stream1.transformWithToPair( stream2, new Function3, JavaRDD, Time, JavaPairRDD>() { @Override - public JavaPairRDD call(JavaRDD rdd1, JavaRDD rdd2, Time time) throws Exception { + public JavaPairRDD call(JavaRDD rdd1, JavaRDD rdd2, Time time) { return null; } } ); - JavaPairDStream transformed4 = stream1.transformWithToPair( + stream1.transformWithToPair( pairStream1, new Function3, JavaPairRDD, Time, JavaPairRDD>() { @Override - public JavaPairRDD call(JavaRDD rdd1, JavaPairRDD rdd2, Time time) throws Exception { + public JavaPairRDD call(JavaRDD rdd1, JavaPairRDD rdd2, Time time) { return null; } } ); - JavaDStream pairTransformed1 = pairStream1.transformWith( + pairStream1.transformWith( stream2, new Function3, JavaRDD, Time, JavaRDD>() { @Override - public JavaRDD call(JavaPairRDD rdd1, JavaRDD rdd2, Time time) throws Exception { + public JavaRDD call(JavaPairRDD rdd1, JavaRDD rdd2, Time time) { return null; } } ); - JavaDStream pairTransformed2_ = pairStream1.transformWith( + pairStream1.transformWith( pairStream1, new Function3, JavaPairRDD, Time, JavaRDD>() { @Override - public JavaRDD call(JavaPairRDD rdd1, JavaPairRDD rdd2, Time time) throws Exception { + public JavaRDD call(JavaPairRDD rdd1, JavaPairRDD rdd2, Time time) { return null; } } ); - JavaPairDStream pairTransformed3 = pairStream1.transformWithToPair( + pairStream1.transformWithToPair( stream2, new Function3, JavaRDD, Time, JavaPairRDD>() { @Override - public JavaPairRDD call(JavaPairRDD rdd1, JavaRDD rdd2, Time time) throws Exception { + public JavaPairRDD call(JavaPairRDD rdd1, JavaRDD rdd2, Time time) { 
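// These transform/transformWith overload checks only verify that the Java signatures
// compile and that the closures type-check; the streams are never started, so each
// call body simply returns null rather than a real RDD.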
return null; } } ); - JavaPairDStream pairTransformed4 = pairStream1.transformWithToPair( + pairStream1.transformWithToPair( pairStream2, new Function3, JavaPairRDD, Time, JavaPairRDD>() { @Override - public JavaPairRDD call(JavaPairRDD rdd1, JavaPairRDD rdd2, Time time) throws Exception { + public JavaPairRDD call(JavaPairRDD rdd1, JavaPairRDD rdd2, Time time) { return null; } } @@ -690,13 +686,13 @@ public void testStreamingContextTransform(){ ); List>> pairStream1input = Arrays.asList( - Arrays.asList(new Tuple2(1, "x")), - Arrays.asList(new Tuple2(2, "y")) + Arrays.asList(new Tuple2<>(1, "x")), + Arrays.asList(new Tuple2<>(2, "y")) ); List>>> expected = Arrays.asList( - Arrays.asList(new Tuple2>(1, new Tuple2(1, "x"))), - Arrays.asList(new Tuple2>(2, new Tuple2(2, "y"))) + Arrays.asList(new Tuple2<>(1, new Tuple2<>(1, "x"))), + Arrays.asList(new Tuple2<>(2, new Tuple2<>(2, "y"))) ); JavaDStream stream1 = JavaTestUtils.attachTestInputStream(ssc, stream1input, 1); @@ -707,7 +703,7 @@ public void testStreamingContextTransform(){ List> listOfDStreams1 = Arrays.>asList(stream1, stream2); // This is just to test whether this transform to JavaStream compiles - JavaDStream transformed1 = ssc.transform( + ssc.transform( listOfDStreams1, new Function2>, Time, JavaRDD>() { @Override @@ -733,8 +729,8 @@ public JavaPairRDD> call(List> listO JavaPairRDD prdd3 = JavaPairRDD.fromJavaRDD(rdd3); PairFunction mapToTuple = new PairFunction() { @Override - public Tuple2 call(Integer i) throws Exception { - return new Tuple2(i, i); + public Tuple2 call(Integer i) { + return new Tuple2<>(i, i); } }; return rdd1.union(rdd2).mapToPair(mapToTuple).join(prdd3); @@ -763,7 +759,7 @@ public void testFlatMap() { JavaDStream flatMapped = stream.flatMap(new FlatMapFunction() { @Override public Iterable call(String x) { - return Lists.newArrayList(x.split("(?!^)")); + return Arrays.asList(x.split("(?!^)")); } }); JavaTestUtils.attachTestOutputStream(flatMapped); @@ -782,39 +778,39 @@ public void testPairFlatMap() { List>> expected = Arrays.asList( Arrays.asList( - new Tuple2(6, "g"), - new Tuple2(6, "i"), - new Tuple2(6, "a"), - new Tuple2(6, "n"), - new Tuple2(6, "t"), - new Tuple2(6, "s")), + new Tuple2<>(6, "g"), + new Tuple2<>(6, "i"), + new Tuple2<>(6, "a"), + new Tuple2<>(6, "n"), + new Tuple2<>(6, "t"), + new Tuple2<>(6, "s")), Arrays.asList( - new Tuple2(7, "d"), - new Tuple2(7, "o"), - new Tuple2(7, "d"), - new Tuple2(7, "g"), - new Tuple2(7, "e"), - new Tuple2(7, "r"), - new Tuple2(7, "s")), + new Tuple2<>(7, "d"), + new Tuple2<>(7, "o"), + new Tuple2<>(7, "d"), + new Tuple2<>(7, "g"), + new Tuple2<>(7, "e"), + new Tuple2<>(7, "r"), + new Tuple2<>(7, "s")), Arrays.asList( - new Tuple2(9, "a"), - new Tuple2(9, "t"), - new Tuple2(9, "h"), - new Tuple2(9, "l"), - new Tuple2(9, "e"), - new Tuple2(9, "t"), - new Tuple2(9, "i"), - new Tuple2(9, "c"), - new Tuple2(9, "s"))); + new Tuple2<>(9, "a"), + new Tuple2<>(9, "t"), + new Tuple2<>(9, "h"), + new Tuple2<>(9, "l"), + new Tuple2<>(9, "e"), + new Tuple2<>(9, "t"), + new Tuple2<>(9, "i"), + new Tuple2<>(9, "c"), + new Tuple2<>(9, "s"))); JavaDStream stream = JavaTestUtils.attachTestInputStream(ssc, inputData, 1); JavaPairDStream flatMapped = stream.flatMapToPair( new PairFlatMapFunction() { @Override - public Iterable> call(String in) throws Exception { - List> out = Lists.newArrayList(); + public Iterable> call(String in) { + List> out = new ArrayList<>(); for (String letter: in.split("(?!^)")) { - out.add(new Tuple2(in.length(), letter)); + out.add(new 
Tuple2<>(in.length(), letter)); } return out; } @@ -859,13 +855,13 @@ public void testUnion() { */ public static void assertOrderInvariantEquals( List> expected, List> actual) { - List> expectedSets = new ArrayList>(); + List> expectedSets = new ArrayList<>(); for (List list: expected) { - expectedSets.add(Collections.unmodifiableSet(new HashSet(list))); + expectedSets.add(Collections.unmodifiableSet(new HashSet<>(list))); } - List> actualSets = new ArrayList>(); + List> actualSets = new ArrayList<>(); for (List list: actual) { - actualSets.add(Collections.unmodifiableSet(new HashSet(list))); + actualSets.add(Collections.unmodifiableSet(new HashSet<>(list))); } Assert.assertEquals(expectedSets, actualSets); } @@ -877,25 +873,25 @@ public static void assertOrderInvariantEquals( public void testPairFilter() { List> inputData = Arrays.asList( Arrays.asList("giants", "dodgers"), - Arrays.asList("yankees", "red socks")); + Arrays.asList("yankees", "red sox")); List>> expected = Arrays.asList( - Arrays.asList(new Tuple2("giants", 6)), - Arrays.asList(new Tuple2("yankees", 7))); + Arrays.asList(new Tuple2<>("giants", 6)), + Arrays.asList(new Tuple2<>("yankees", 7))); JavaDStream stream = JavaTestUtils.attachTestInputStream(ssc, inputData, 1); JavaPairDStream pairStream = stream.mapToPair( new PairFunction() { @Override - public Tuple2 call(String in) throws Exception { - return new Tuple2(in, in.length()); + public Tuple2 call(String in) { + return new Tuple2<>(in, in.length()); } }); JavaPairDStream filtered = pairStream.filter( new Function, Boolean>() { @Override - public Boolean call(Tuple2 in) throws Exception { + public Boolean call(Tuple2 in) { return in._1().contains("a"); } }); @@ -906,28 +902,28 @@ public Boolean call(Tuple2 in) throws Exception { } @SuppressWarnings("unchecked") - private List>> stringStringKVStream = Arrays.asList( - Arrays.asList(new Tuple2("california", "dodgers"), - new Tuple2("california", "giants"), - new Tuple2("new york", "yankees"), - new Tuple2("new york", "mets")), - Arrays.asList(new Tuple2("california", "sharks"), - new Tuple2("california", "ducks"), - new Tuple2("new york", "rangers"), - new Tuple2("new york", "islanders"))); + private final List>> stringStringKVStream = Arrays.asList( + Arrays.asList(new Tuple2<>("california", "dodgers"), + new Tuple2<>("california", "giants"), + new Tuple2<>("new york", "yankees"), + new Tuple2<>("new york", "mets")), + Arrays.asList(new Tuple2<>("california", "sharks"), + new Tuple2<>("california", "ducks"), + new Tuple2<>("new york", "rangers"), + new Tuple2<>("new york", "islanders"))); @SuppressWarnings("unchecked") - private List>> stringIntKVStream = Arrays.asList( + private final List>> stringIntKVStream = Arrays.asList( Arrays.asList( - new Tuple2("california", 1), - new Tuple2("california", 3), - new Tuple2("new york", 4), - new Tuple2("new york", 1)), + new Tuple2<>("california", 1), + new Tuple2<>("california", 3), + new Tuple2<>("new york", 4), + new Tuple2<>("new york", 1)), Arrays.asList( - new Tuple2("california", 5), - new Tuple2("california", 5), - new Tuple2("new york", 3), - new Tuple2("new york", 1))); + new Tuple2<>("california", 5), + new Tuple2<>("california", 5), + new Tuple2<>("new york", 3), + new Tuple2<>("new york", 1))); @SuppressWarnings("unchecked") @Test @@ -936,22 +932,22 @@ public void testPairMap() { // Maps pair -> pair of different type List>> expected = Arrays.asList( Arrays.asList( - new Tuple2(1, "california"), - new Tuple2(3, "california"), - new Tuple2(4, "new york"), - new 
Tuple2(1, "new york")), + new Tuple2<>(1, "california"), + new Tuple2<>(3, "california"), + new Tuple2<>(4, "new york"), + new Tuple2<>(1, "new york")), Arrays.asList( - new Tuple2(5, "california"), - new Tuple2(5, "california"), - new Tuple2(3, "new york"), - new Tuple2(1, "new york"))); + new Tuple2<>(5, "california"), + new Tuple2<>(5, "california"), + new Tuple2<>(3, "new york"), + new Tuple2<>(1, "new york"))); JavaDStream> stream = JavaTestUtils.attachTestInputStream(ssc, inputData, 1); JavaPairDStream pairStream = JavaPairDStream.fromJavaDStream(stream); JavaPairDStream reversed = pairStream.mapToPair( new PairFunction, Integer, String>() { @Override - public Tuple2 call(Tuple2 in) throws Exception { + public Tuple2 call(Tuple2 in) { return in.swap(); } }); @@ -969,23 +965,23 @@ public void testPairMapPartitions() { // Maps pair -> pair of different type List>> expected = Arrays.asList( Arrays.asList( - new Tuple2(1, "california"), - new Tuple2(3, "california"), - new Tuple2(4, "new york"), - new Tuple2(1, "new york")), + new Tuple2<>(1, "california"), + new Tuple2<>(3, "california"), + new Tuple2<>(4, "new york"), + new Tuple2<>(1, "new york")), Arrays.asList( - new Tuple2(5, "california"), - new Tuple2(5, "california"), - new Tuple2(3, "new york"), - new Tuple2(1, "new york"))); + new Tuple2<>(5, "california"), + new Tuple2<>(5, "california"), + new Tuple2<>(3, "new york"), + new Tuple2<>(1, "new york"))); JavaDStream> stream = JavaTestUtils.attachTestInputStream(ssc, inputData, 1); JavaPairDStream pairStream = JavaPairDStream.fromJavaDStream(stream); JavaPairDStream reversed = pairStream.mapPartitionsToPair( new PairFlatMapFunction>, Integer, String>() { @Override - public Iterable> call(Iterator> in) throws Exception { - LinkedList> out = new LinkedList>(); + public Iterable> call(Iterator> in) { + List> out = new LinkedList<>(); while (in.hasNext()) { Tuple2 next = in.next(); out.add(next.swap()); @@ -1014,7 +1010,7 @@ public void testPairMap2() { // Maps pair -> single JavaDStream reversed = pairStream.map( new Function, Integer>() { @Override - public Integer call(Tuple2 in) throws Exception { + public Integer call(Tuple2 in) { return in._2(); } }); @@ -1030,23 +1026,23 @@ public Integer call(Tuple2 in) throws Exception { public void testPairToPairFlatMapWithChangingTypes() { // Maps pair -> pair List>> inputData = Arrays.asList( Arrays.asList( - new Tuple2("hi", 1), - new Tuple2("ho", 2)), + new Tuple2<>("hi", 1), + new Tuple2<>("ho", 2)), Arrays.asList( - new Tuple2("hi", 1), - new Tuple2("ho", 2))); + new Tuple2<>("hi", 1), + new Tuple2<>("ho", 2))); List>> expected = Arrays.asList( Arrays.asList( - new Tuple2(1, "h"), - new Tuple2(1, "i"), - new Tuple2(2, "h"), - new Tuple2(2, "o")), + new Tuple2<>(1, "h"), + new Tuple2<>(1, "i"), + new Tuple2<>(2, "h"), + new Tuple2<>(2, "o")), Arrays.asList( - new Tuple2(1, "h"), - new Tuple2(1, "i"), - new Tuple2(2, "h"), - new Tuple2(2, "o"))); + new Tuple2<>(1, "h"), + new Tuple2<>(1, "i"), + new Tuple2<>(2, "h"), + new Tuple2<>(2, "o"))); JavaDStream> stream = JavaTestUtils.attachTestInputStream(ssc, inputData, 1); @@ -1054,10 +1050,10 @@ public void testPairToPairFlatMapWithChangingTypes() { // Maps pair -> pair JavaPairDStream flatMapped = pairStream.flatMapToPair( new PairFlatMapFunction, Integer, String>() { @Override - public Iterable> call(Tuple2 in) throws Exception { - List> out = new LinkedList>(); + public Iterable> call(Tuple2 in) { + List> out = new LinkedList<>(); for (Character s : in._1().toCharArray()) { - 
out.add(new Tuple2(in._2(), s.toString())); + out.add(new Tuple2<>(in._2(), s.toString())); } return out; } @@ -1075,11 +1071,11 @@ public void testPairGroupByKey() { List>>> expected = Arrays.asList( Arrays.asList( - new Tuple2>("california", Arrays.asList("dodgers", "giants")), - new Tuple2>("new york", Arrays.asList("yankees", "mets"))), + new Tuple2<>("california", Arrays.asList("dodgers", "giants")), + new Tuple2<>("new york", Arrays.asList("yankees", "mets"))), Arrays.asList( - new Tuple2>("california", Arrays.asList("sharks", "ducks")), - new Tuple2>("new york", Arrays.asList("rangers", "islanders")))); + new Tuple2<>("california", Arrays.asList("sharks", "ducks")), + new Tuple2<>("new york", Arrays.asList("rangers", "islanders")))); JavaDStream> stream = JavaTestUtils.attachTestInputStream(ssc, inputData, 1); JavaPairDStream pairStream = JavaPairDStream.fromJavaDStream(stream); @@ -1111,11 +1107,11 @@ public void testPairReduceByKey() { List>> expected = Arrays.asList( Arrays.asList( - new Tuple2("california", 4), - new Tuple2("new york", 5)), + new Tuple2<>("california", 4), + new Tuple2<>("new york", 5)), Arrays.asList( - new Tuple2("california", 10), - new Tuple2("new york", 4))); + new Tuple2<>("california", 10), + new Tuple2<>("new york", 4))); JavaDStream> stream = JavaTestUtils.attachTestInputStream( ssc, inputData, 1); @@ -1136,20 +1132,20 @@ public void testCombineByKey() { List>> expected = Arrays.asList( Arrays.asList( - new Tuple2("california", 4), - new Tuple2("new york", 5)), + new Tuple2<>("california", 4), + new Tuple2<>("new york", 5)), Arrays.asList( - new Tuple2("california", 10), - new Tuple2("new york", 4))); + new Tuple2<>("california", 10), + new Tuple2<>("new york", 4))); JavaDStream> stream = JavaTestUtils.attachTestInputStream( ssc, inputData, 1); JavaPairDStream pairStream = JavaPairDStream.fromJavaDStream(stream); - JavaPairDStream combined = pairStream.combineByKey( + JavaPairDStream combined = pairStream.combineByKey( new Function() { @Override - public Integer call(Integer i) throws Exception { + public Integer call(Integer i) { return i; } }, new IntegerSum(), new IntegerSum(), new HashPartitioner(2)); @@ -1170,13 +1166,13 @@ public void testCountByValue() { List>> expected = Arrays.asList( Arrays.asList( - new Tuple2("hello", 1L), - new Tuple2("world", 1L)), + new Tuple2<>("hello", 1L), + new Tuple2<>("world", 1L)), Arrays.asList( - new Tuple2("hello", 1L), - new Tuple2("moon", 1L)), + new Tuple2<>("hello", 1L), + new Tuple2<>("moon", 1L)), Arrays.asList( - new Tuple2("hello", 1L))); + new Tuple2<>("hello", 1L))); JavaDStream stream = JavaTestUtils.attachTestInputStream(ssc, inputData, 1); JavaPairDStream counted = stream.countByValue(); @@ -1193,16 +1189,16 @@ public void testGroupByKeyAndWindow() { List>>> expected = Arrays.asList( Arrays.asList( - new Tuple2>("california", Arrays.asList(1, 3)), - new Tuple2>("new york", Arrays.asList(1, 4)) + new Tuple2<>("california", Arrays.asList(1, 3)), + new Tuple2<>("new york", Arrays.asList(1, 4)) ), Arrays.asList( - new Tuple2>("california", Arrays.asList(1, 3, 5, 5)), - new Tuple2>("new york", Arrays.asList(1, 1, 3, 4)) + new Tuple2<>("california", Arrays.asList(1, 3, 5, 5)), + new Tuple2<>("new york", Arrays.asList(1, 1, 3, 4)) ), Arrays.asList( - new Tuple2>("california", Arrays.asList(5, 5)), - new Tuple2>("new york", Arrays.asList(1, 3)) + new Tuple2<>("california", Arrays.asList(5, 5)), + new Tuple2<>("new york", Arrays.asList(1, 3)) ) ); @@ -1220,16 +1216,16 @@ public void testGroupByKeyAndWindow() 
{ } } - private HashSet>> convert(List>> listOfTuples) { - List>> newListOfTuples = new ArrayList>>(); + private static Set>> convert(List>> listOfTuples) { + List>> newListOfTuples = new ArrayList<>(); for (Tuple2> tuple: listOfTuples) { newListOfTuples.add(convert(tuple)); } - return new HashSet>>(newListOfTuples); + return new HashSet<>(newListOfTuples); } - private Tuple2> convert(Tuple2> tuple) { - return new Tuple2>(tuple._1(), new HashSet(tuple._2())); + private static Tuple2> convert(Tuple2> tuple) { + return new Tuple2<>(tuple._1(), new HashSet<>(tuple._2())); } @SuppressWarnings("unchecked") @@ -1238,12 +1234,12 @@ public void testReduceByKeyAndWindow() { List>> inputData = stringIntKVStream; List>> expected = Arrays.asList( - Arrays.asList(new Tuple2("california", 4), - new Tuple2("new york", 5)), - Arrays.asList(new Tuple2("california", 14), - new Tuple2("new york", 9)), - Arrays.asList(new Tuple2("california", 10), - new Tuple2("new york", 4))); + Arrays.asList(new Tuple2<>("california", 4), + new Tuple2<>("new york", 5)), + Arrays.asList(new Tuple2<>("california", 14), + new Tuple2<>("new york", 9)), + Arrays.asList(new Tuple2<>("california", 10), + new Tuple2<>("new york", 4))); JavaDStream> stream = JavaTestUtils.attachTestInputStream(ssc, inputData, 1); JavaPairDStream pairStream = JavaPairDStream.fromJavaDStream(stream); @@ -1262,12 +1258,12 @@ public void testUpdateStateByKey() { List>> inputData = stringIntKVStream; List>> expected = Arrays.asList( - Arrays.asList(new Tuple2("california", 4), - new Tuple2("new york", 5)), - Arrays.asList(new Tuple2("california", 14), - new Tuple2("new york", 9)), - Arrays.asList(new Tuple2("california", 14), - new Tuple2("new york", 9))); + Arrays.asList(new Tuple2<>("california", 4), + new Tuple2<>("new york", 5)), + Arrays.asList(new Tuple2<>("california", 14), + new Tuple2<>("new york", 9)), + Arrays.asList(new Tuple2<>("california", 14), + new Tuple2<>("new york", 9))); JavaDStream> stream = JavaTestUtils.attachTestInputStream(ssc, inputData, 1); JavaPairDStream pairStream = JavaPairDStream.fromJavaDStream(stream); @@ -1278,10 +1274,10 @@ public void testUpdateStateByKey() { public Optional call(List values, Optional state) { int out = 0; if (state.isPresent()) { - out = out + state.get(); + out += state.get(); } for (Integer v : values) { - out = out + v; + out += v; } return Optional.of(out); } @@ -1298,19 +1294,19 @@ public void testUpdateStateByKeyWithInitial() { List>> inputData = stringIntKVStream; List> initial = Arrays.asList ( - new Tuple2 ("california", 1), - new Tuple2 ("new york", 2)); + new Tuple2<>("california", 1), + new Tuple2<>("new york", 2)); JavaRDD> tmpRDD = ssc.sparkContext().parallelize(initial); JavaPairRDD initialRDD = JavaPairRDD.fromJavaRDD (tmpRDD); List>> expected = Arrays.asList( - Arrays.asList(new Tuple2("california", 5), - new Tuple2("new york", 7)), - Arrays.asList(new Tuple2("california", 15), - new Tuple2("new york", 11)), - Arrays.asList(new Tuple2("california", 15), - new Tuple2("new york", 11))); + Arrays.asList(new Tuple2<>("california", 5), + new Tuple2<>("new york", 7)), + Arrays.asList(new Tuple2<>("california", 15), + new Tuple2<>("new york", 11)), + Arrays.asList(new Tuple2<>("california", 15), + new Tuple2<>("new york", 11))); JavaDStream> stream = JavaTestUtils.attachTestInputStream(ssc, inputData, 1); JavaPairDStream pairStream = JavaPairDStream.fromJavaDStream(stream); @@ -1321,10 +1317,10 @@ public void testUpdateStateByKeyWithInitial() { public Optional call(List values, Optional 
state) { int out = 0; if (state.isPresent()) { - out = out + state.get(); + out += state.get(); } for (Integer v : values) { - out = out + v; + out += v; } return Optional.of(out); } @@ -1341,19 +1337,19 @@ public void testReduceByKeyAndWindowWithInverse() { List>> inputData = stringIntKVStream; List>> expected = Arrays.asList( - Arrays.asList(new Tuple2("california", 4), - new Tuple2("new york", 5)), - Arrays.asList(new Tuple2("california", 14), - new Tuple2("new york", 9)), - Arrays.asList(new Tuple2("california", 10), - new Tuple2("new york", 4))); + Arrays.asList(new Tuple2<>("california", 4), + new Tuple2<>("new york", 5)), + Arrays.asList(new Tuple2<>("california", 14), + new Tuple2<>("new york", 9)), + Arrays.asList(new Tuple2<>("california", 10), + new Tuple2<>("new york", 4))); JavaDStream> stream = JavaTestUtils.attachTestInputStream(ssc, inputData, 1); JavaPairDStream pairStream = JavaPairDStream.fromJavaDStream(stream); JavaPairDStream reduceWindowed = pairStream.reduceByKeyAndWindow(new IntegerSum(), new IntegerDifference(), - new Duration(2000), new Duration(1000)); + new Duration(2000), new Duration(1000)); JavaTestUtils.attachTestOutputStream(reduceWindowed); List>> result = JavaTestUtils.runStreams(ssc, 3, 3); @@ -1370,15 +1366,15 @@ public void testCountByValueAndWindow() { List>> expected = Arrays.asList( Sets.newHashSet( - new Tuple2("hello", 1L), - new Tuple2("world", 1L)), + new Tuple2<>("hello", 1L), + new Tuple2<>("world", 1L)), Sets.newHashSet( - new Tuple2("hello", 2L), - new Tuple2("world", 1L), - new Tuple2("moon", 1L)), + new Tuple2<>("hello", 2L), + new Tuple2<>("world", 1L), + new Tuple2<>("moon", 1L)), Sets.newHashSet( - new Tuple2("hello", 2L), - new Tuple2("moon", 1L))); + new Tuple2<>("hello", 2L), + new Tuple2<>("moon", 1L))); JavaDStream stream = JavaTestUtils.attachTestInputStream( ssc, inputData, 1); @@ -1386,7 +1382,7 @@ public void testCountByValueAndWindow() { stream.countByValueAndWindow(new Duration(2000), new Duration(1000)); JavaTestUtils.attachTestOutputStream(counted); List>> result = JavaTestUtils.runStreams(ssc, 3, 3); - List>> unorderedResult = Lists.newArrayList(); + List>> unorderedResult = new ArrayList<>(); for (List> res: result) { unorderedResult.add(Sets.newHashSet(res)); } @@ -1399,27 +1395,27 @@ public void testCountByValueAndWindow() { public void testPairTransform() { List>> inputData = Arrays.asList( Arrays.asList( - new Tuple2(3, 5), - new Tuple2(1, 5), - new Tuple2(4, 5), - new Tuple2(2, 5)), + new Tuple2<>(3, 5), + new Tuple2<>(1, 5), + new Tuple2<>(4, 5), + new Tuple2<>(2, 5)), Arrays.asList( - new Tuple2(2, 5), - new Tuple2(3, 5), - new Tuple2(4, 5), - new Tuple2(1, 5))); + new Tuple2<>(2, 5), + new Tuple2<>(3, 5), + new Tuple2<>(4, 5), + new Tuple2<>(1, 5))); List>> expected = Arrays.asList( Arrays.asList( - new Tuple2(1, 5), - new Tuple2(2, 5), - new Tuple2(3, 5), - new Tuple2(4, 5)), + new Tuple2<>(1, 5), + new Tuple2<>(2, 5), + new Tuple2<>(3, 5), + new Tuple2<>(4, 5)), Arrays.asList( - new Tuple2(1, 5), - new Tuple2(2, 5), - new Tuple2(3, 5), - new Tuple2(4, 5))); + new Tuple2<>(1, 5), + new Tuple2<>(2, 5), + new Tuple2<>(3, 5), + new Tuple2<>(4, 5))); JavaDStream> stream = JavaTestUtils.attachTestInputStream( ssc, inputData, 1); @@ -1428,7 +1424,7 @@ public void testPairTransform() { JavaPairDStream sorted = pairStream.transformToPair( new Function, JavaPairRDD>() { @Override - public JavaPairRDD call(JavaPairRDD in) throws Exception { + public JavaPairRDD call(JavaPairRDD in) { return in.sortByKey(); } }); @@ 
-1444,15 +1440,15 @@ public JavaPairRDD call(JavaPairRDD in) thro public void testPairToNormalRDDTransform() { List>> inputData = Arrays.asList( Arrays.asList( - new Tuple2(3, 5), - new Tuple2(1, 5), - new Tuple2(4, 5), - new Tuple2(2, 5)), + new Tuple2<>(3, 5), + new Tuple2<>(1, 5), + new Tuple2<>(4, 5), + new Tuple2<>(2, 5)), Arrays.asList( - new Tuple2(2, 5), - new Tuple2(3, 5), - new Tuple2(4, 5), - new Tuple2(1, 5))); + new Tuple2<>(2, 5), + new Tuple2<>(3, 5), + new Tuple2<>(4, 5), + new Tuple2<>(1, 5))); List> expected = Arrays.asList( Arrays.asList(3,1,4,2), @@ -1465,11 +1461,11 @@ public void testPairToNormalRDDTransform() { JavaDStream firstParts = pairStream.transform( new Function, JavaRDD>() { @Override - public JavaRDD call(JavaPairRDD in) throws Exception { + public JavaRDD call(JavaPairRDD in) { return in.map(new Function, Integer>() { @Override - public Integer call(Tuple2 in) { - return in._1(); + public Integer call(Tuple2 in2) { + return in2._1(); } }); } @@ -1487,14 +1483,14 @@ public void testMapValues() { List>> inputData = stringStringKVStream; List>> expected = Arrays.asList( - Arrays.asList(new Tuple2("california", "DODGERS"), - new Tuple2("california", "GIANTS"), - new Tuple2("new york", "YANKEES"), - new Tuple2("new york", "METS")), - Arrays.asList(new Tuple2("california", "SHARKS"), - new Tuple2("california", "DUCKS"), - new Tuple2("new york", "RANGERS"), - new Tuple2("new york", "ISLANDERS"))); + Arrays.asList(new Tuple2<>("california", "DODGERS"), + new Tuple2<>("california", "GIANTS"), + new Tuple2<>("new york", "YANKEES"), + new Tuple2<>("new york", "METS")), + Arrays.asList(new Tuple2<>("california", "SHARKS"), + new Tuple2<>("california", "DUCKS"), + new Tuple2<>("new york", "RANGERS"), + new Tuple2<>("new york", "ISLANDERS"))); JavaDStream> stream = JavaTestUtils.attachTestInputStream( ssc, inputData, 1); @@ -1502,8 +1498,8 @@ public void testMapValues() { JavaPairDStream mapped = pairStream.mapValues(new Function() { @Override - public String call(String s) throws Exception { - return s.toUpperCase(); + public String call(String s) { + return s.toUpperCase(Locale.ENGLISH); } }); @@ -1519,22 +1515,22 @@ public void testFlatMapValues() { List>> inputData = stringStringKVStream; List>> expected = Arrays.asList( - Arrays.asList(new Tuple2("california", "dodgers1"), - new Tuple2("california", "dodgers2"), - new Tuple2("california", "giants1"), - new Tuple2("california", "giants2"), - new Tuple2("new york", "yankees1"), - new Tuple2("new york", "yankees2"), - new Tuple2("new york", "mets1"), - new Tuple2("new york", "mets2")), - Arrays.asList(new Tuple2("california", "sharks1"), - new Tuple2("california", "sharks2"), - new Tuple2("california", "ducks1"), - new Tuple2("california", "ducks2"), - new Tuple2("new york", "rangers1"), - new Tuple2("new york", "rangers2"), - new Tuple2("new york", "islanders1"), - new Tuple2("new york", "islanders2"))); + Arrays.asList(new Tuple2<>("california", "dodgers1"), + new Tuple2<>("california", "dodgers2"), + new Tuple2<>("california", "giants1"), + new Tuple2<>("california", "giants2"), + new Tuple2<>("new york", "yankees1"), + new Tuple2<>("new york", "yankees2"), + new Tuple2<>("new york", "mets1"), + new Tuple2<>("new york", "mets2")), + Arrays.asList(new Tuple2<>("california", "sharks1"), + new Tuple2<>("california", "sharks2"), + new Tuple2<>("california", "ducks1"), + new Tuple2<>("california", "ducks2"), + new Tuple2<>("new york", "rangers1"), + new Tuple2<>("new york", "rangers2"), + new Tuple2<>("new york", 
"islanders1"), + new Tuple2<>("new york", "islanders2"))); JavaDStream> stream = JavaTestUtils.attachTestInputStream( ssc, inputData, 1); @@ -1545,7 +1541,7 @@ public void testFlatMapValues() { new Function>() { @Override public Iterable call(String in) { - List out = new ArrayList(); + List out = new ArrayList<>(); out.add(in + "1"); out.add(in + "2"); return out; @@ -1562,29 +1558,29 @@ public Iterable call(String in) { @Test public void testCoGroup() { List>> stringStringKVStream1 = Arrays.asList( - Arrays.asList(new Tuple2("california", "dodgers"), - new Tuple2("new york", "yankees")), - Arrays.asList(new Tuple2("california", "sharks"), - new Tuple2("new york", "rangers"))); + Arrays.asList(new Tuple2<>("california", "dodgers"), + new Tuple2<>("new york", "yankees")), + Arrays.asList(new Tuple2<>("california", "sharks"), + new Tuple2<>("new york", "rangers"))); List>> stringStringKVStream2 = Arrays.asList( - Arrays.asList(new Tuple2("california", "giants"), - new Tuple2("new york", "mets")), - Arrays.asList(new Tuple2("california", "ducks"), - new Tuple2("new york", "islanders"))); + Arrays.asList(new Tuple2<>("california", "giants"), + new Tuple2<>("new york", "mets")), + Arrays.asList(new Tuple2<>("california", "ducks"), + new Tuple2<>("new york", "islanders"))); List, List>>>> expected = Arrays.asList( Arrays.asList( - new Tuple2, List>>("california", - new Tuple2, List>(Arrays.asList("dodgers"), Arrays.asList("giants"))), - new Tuple2, List>>("new york", - new Tuple2, List>(Arrays.asList("yankees"), Arrays.asList("mets")))), + new Tuple2<>("california", + new Tuple2<>(Arrays.asList("dodgers"), Arrays.asList("giants"))), + new Tuple2<>("new york", + new Tuple2<>(Arrays.asList("yankees"), Arrays.asList("mets")))), Arrays.asList( - new Tuple2, List>>("california", - new Tuple2, List>(Arrays.asList("sharks"), Arrays.asList("ducks"))), - new Tuple2, List>>("new york", - new Tuple2, List>(Arrays.asList("rangers"), Arrays.asList("islanders"))))); + new Tuple2<>("california", + new Tuple2<>(Arrays.asList("sharks"), Arrays.asList("ducks"))), + new Tuple2<>("new york", + new Tuple2<>(Arrays.asList("rangers"), Arrays.asList("islanders"))))); JavaDStream> stream1 = JavaTestUtils.attachTestInputStream( @@ -1620,29 +1616,29 @@ public void testCoGroup() { @Test public void testJoin() { List>> stringStringKVStream1 = Arrays.asList( - Arrays.asList(new Tuple2("california", "dodgers"), - new Tuple2("new york", "yankees")), - Arrays.asList(new Tuple2("california", "sharks"), - new Tuple2("new york", "rangers"))); + Arrays.asList(new Tuple2<>("california", "dodgers"), + new Tuple2<>("new york", "yankees")), + Arrays.asList(new Tuple2<>("california", "sharks"), + new Tuple2<>("new york", "rangers"))); List>> stringStringKVStream2 = Arrays.asList( - Arrays.asList(new Tuple2("california", "giants"), - new Tuple2("new york", "mets")), - Arrays.asList(new Tuple2("california", "ducks"), - new Tuple2("new york", "islanders"))); + Arrays.asList(new Tuple2<>("california", "giants"), + new Tuple2<>("new york", "mets")), + Arrays.asList(new Tuple2<>("california", "ducks"), + new Tuple2<>("new york", "islanders"))); List>>> expected = Arrays.asList( Arrays.asList( - new Tuple2>("california", - new Tuple2("dodgers", "giants")), - new Tuple2>("new york", - new Tuple2("yankees", "mets"))), + new Tuple2<>("california", + new Tuple2<>("dodgers", "giants")), + new Tuple2<>("new york", + new Tuple2<>("yankees", "mets"))), Arrays.asList( - new Tuple2>("california", - new Tuple2("sharks", "ducks")), - new Tuple2>("new 
york", - new Tuple2("rangers", "islanders")))); + new Tuple2<>("california", + new Tuple2<>("sharks", "ducks")), + new Tuple2<>("new york", + new Tuple2<>("rangers", "islanders")))); JavaDStream> stream1 = JavaTestUtils.attachTestInputStream( @@ -1664,13 +1660,13 @@ public void testJoin() { @Test public void testLeftOuterJoin() { List>> stringStringKVStream1 = Arrays.asList( - Arrays.asList(new Tuple2("california", "dodgers"), - new Tuple2("new york", "yankees")), - Arrays.asList(new Tuple2("california", "sharks") )); + Arrays.asList(new Tuple2<>("california", "dodgers"), + new Tuple2<>("new york", "yankees")), + Arrays.asList(new Tuple2<>("california", "sharks") )); List>> stringStringKVStream2 = Arrays.asList( - Arrays.asList(new Tuple2("california", "giants") ), - Arrays.asList(new Tuple2("new york", "islanders") ) + Arrays.asList(new Tuple2<>("california", "giants") ), + Arrays.asList(new Tuple2<>("new york", "islanders") ) ); @@ -1713,7 +1709,7 @@ public void testCheckpointMasterRecovery() throws InterruptedException { JavaDStream stream = JavaCheckpointTestUtils.attachTestInputStream(ssc, inputData, 1); JavaDStream letterCount = stream.map(new Function() { @Override - public Integer call(String s) throws Exception { + public Integer call(String s) { return s.length(); } }); @@ -1752,6 +1748,7 @@ public void testContextGetOrCreate() throws InterruptedException { // (used to detect the new context) final AtomicBoolean newContextCreated = new AtomicBoolean(false); Function0 creatingFunc = new Function0() { + @Override public JavaStreamingContext call() { newContextCreated.set(true); return new JavaStreamingContext(conf, Seconds.apply(1)); @@ -1765,20 +1762,20 @@ public JavaStreamingContext call() { newContextCreated.set(false); ssc = JavaStreamingContext.getOrCreate(corruptedCheckpointDir, creatingFunc, - new org.apache.hadoop.conf.Configuration(), true); + new Configuration(), true); Assert.assertTrue("new context not created", newContextCreated.get()); ssc.stop(); newContextCreated.set(false); ssc = JavaStreamingContext.getOrCreate(checkpointDir, creatingFunc, - new org.apache.hadoop.conf.Configuration()); + new Configuration()); Assert.assertTrue("old context not recovered", !newContextCreated.get()); ssc.stop(); newContextCreated.set(false); JavaSparkContext sc = new JavaSparkContext(conf); ssc = JavaStreamingContext.getOrCreate(checkpointDir, creatingFunc, - new org.apache.hadoop.conf.Configuration()); + new Configuration()); Assert.assertTrue("old context not recovered", !newContextCreated.get()); ssc.stop(); } @@ -1800,7 +1797,7 @@ public void testCheckpointofIndividualStream() throws InterruptedException { JavaDStream stream = JavaCheckpointTestUtils.attachTestInputStream(ssc, inputData, 1); JavaDStream letterCount = stream.map(new Function() { @Override - public Integer call(String s) throws Exception { + public Integer call(String s) { return s.length(); } }); @@ -1818,29 +1815,26 @@ public Integer call(String s) throws Exception { // InputStream functionality is deferred to the existing Scala tests. 
@Test public void testSocketTextStream() { - JavaReceiverInputDStream test = ssc.socketTextStream("localhost", 12345); + ssc.socketTextStream("localhost", 12345); } @Test public void testSocketString() { - - class Converter implements Function> { - public Iterable call(InputStream in) throws IOException { - BufferedReader reader = new BufferedReader(new InputStreamReader(in)); - List out = new ArrayList(); - while (true) { - String line = reader.readLine(); - if (line == null) { break; } - out.add(line); - } - return out; - } - } - - JavaDStream test = ssc.socketStream( + ssc.socketStream( "localhost", 12345, - new Converter(), + new Function>() { + @Override + public Iterable call(InputStream in) throws IOException { + List out = new ArrayList<>(); + try (BufferedReader reader = new BufferedReader(new InputStreamReader(in))) { + for (String line; (line = reader.readLine()) != null;) { + out.add(line); + } + } + return out; + } + }, StorageLevel.MEMORY_ONLY()); } @@ -1870,7 +1864,7 @@ public void testFileStream() throws IOException { TextInputFormat.class, new Function() { @Override - public Boolean call(Path v1) throws Exception { + public Boolean call(Path v1) { return Boolean.TRUE; } }, @@ -1879,7 +1873,7 @@ public Boolean call(Path v1) throws Exception { JavaDStream test = inputStream.map( new Function, String>() { @Override - public String call(Tuple2 v1) throws Exception { + public String call(Tuple2 v1) { return v1._2().toString(); } }); @@ -1892,19 +1886,15 @@ public String call(Tuple2 v1) throws Exception { @Test public void testRawSocketStream() { - JavaReceiverInputDStream test = ssc.rawSocketStream("localhost", 12345); + ssc.rawSocketStream("localhost", 12345); } - private List> fileTestPrepare(File testDir) throws IOException { + private static List> fileTestPrepare(File testDir) throws IOException { File existingFile = new File(testDir, "0"); Files.write("0\n", existingFile, Charset.forName("UTF-8")); - assertTrue(existingFile.setLastModified(1000) && existingFile.lastModified() == 1000); - - List> expected = Arrays.asList( - Arrays.asList("0") - ); - - return expected; + Assert.assertTrue(existingFile.setLastModified(1000)); + Assert.assertEquals(1000, existingFile.lastModified()); + return Arrays.asList(Arrays.asList("0")); } @SuppressWarnings("unchecked") diff --git a/streaming/src/test/java/org/apache/spark/streaming/JavaReceiverAPISuite.java b/streaming/src/test/java/org/apache/spark/streaming/JavaReceiverAPISuite.java index 1b0787fe69dec..ec2bffd6a5b97 100644 --- a/streaming/src/test/java/org/apache/spark/streaming/JavaReceiverAPISuite.java +++ b/streaming/src/test/java/org/apache/spark/streaming/JavaReceiverAPISuite.java @@ -36,7 +36,6 @@ import java.io.Serializable; import java.net.ConnectException; import java.net.Socket; -import java.util.concurrent.atomic.AtomicInteger; import java.util.concurrent.atomic.AtomicLong; public class JavaReceiverAPISuite implements Serializable { @@ -64,16 +63,16 @@ public void testReceiver() throws InterruptedException { ssc.receiverStream(new JavaSocketReceiver("localhost", server.port())); JavaDStream mapped = input.map(new Function() { @Override - public String call(String v1) throws Exception { + public String call(String v1) { return v1 + "."; } }); mapped.foreachRDD(new Function, Void>() { @Override - public Void call(JavaRDD rdd) throws Exception { - long count = rdd.count(); - dataCounter.addAndGet(count); - return null; + public Void call(JavaRDD rdd) { + long count = rdd.count(); + dataCounter.addAndGet(count); + return null; 
} }); @@ -83,7 +82,7 @@ public Void call(JavaRDD rdd) throws Exception { Thread.sleep(200); for (int i = 0; i < 6; i++) { - server.send("" + i + "\n"); // \n to make sure these are separate lines + server.send(i + "\n"); // \n to make sure these are separate lines Thread.sleep(100); } while (dataCounter.get() == 0 && System.currentTimeMillis() - startTime < timeout) { @@ -95,50 +94,49 @@ public Void call(JavaRDD rdd) throws Exception { server.stop(); } } -} -class JavaSocketReceiver extends Receiver { + private static class JavaSocketReceiver extends Receiver { - String host = null; - int port = -1; + String host = null; + int port = -1; - public JavaSocketReceiver(String host_ , int port_) { - super(StorageLevel.MEMORY_AND_DISK()); - host = host_; - port = port_; - } + JavaSocketReceiver(String host_ , int port_) { + super(StorageLevel.MEMORY_AND_DISK()); + host = host_; + port = port_; + } - @Override - public void onStart() { - new Thread() { - @Override public void run() { - receive(); - } - }.start(); - } + @Override + public void onStart() { + new Thread() { + @Override public void run() { + receive(); + } + }.start(); + } - @Override - public void onStop() { - } + @Override + public void onStop() { + } - private void receive() { - Socket socket = null; - try { - socket = new Socket(host, port); - BufferedReader in = new BufferedReader(new InputStreamReader(socket.getInputStream())); - String userInput; - while ((userInput = in.readLine()) != null) { - store(userInput); + private void receive() { + try { + Socket socket = new Socket(host, port); + BufferedReader in = new BufferedReader(new InputStreamReader(socket.getInputStream())); + String userInput; + while ((userInput = in.readLine()) != null) { + store(userInput); + } + in.close(); + socket.close(); + } catch(ConnectException ce) { + ce.printStackTrace(); + restart("Could not connect", ce); + } catch(Throwable t) { + t.printStackTrace(); + restart("Error receiving data", t); } - in.close(); - socket.close(); - } catch(ConnectException ce) { - ce.printStackTrace(); - restart("Could not connect", ce); - } catch(Throwable t) { - t.printStackTrace(); - restart("Error receiving data", t); } } -} +} From f4a22808e03fa12bfe1bfc82cf713cfda7e063a9 Mon Sep 17 00:00:00 2001 From: JihongMa Date: Sat, 12 Sep 2015 10:17:15 -0700 Subject: [PATCH 268/802] [SPARK-6548] Adding stddev to DataFrame functions Adding STDDEV support for DataFrame using 1-pass online /parallel algorithm to compute variance. Please review the code change. Author: JihongMa Author: Jihong MA Author: Jihong MA Author: Jihong MA Closes #6297 from JihongMA/SPARK-SQL. 
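For reference, the "1-pass online/parallel algorithm" mentioned here is the Welford-style running update combined with the pairwise merge from the Wikipedia page cited in the new StddevAgg/ComputePartialStd code below. A minimal standalone sketch, assuming only the Scala standard library (StdState, update and merge are illustrative names, not part of this patch):

  final case class StdState(count: Double, mean: Double, m2: Double) {
    // One-pass update: avg += (x - avg)/n;  Mk += (x - preAvg) * (x - newAvg)
    def update(x: Double): StdState = {
      val n = count + 1
      val delta = x - mean
      val newMean = mean + delta / n
      StdState(n, newMean, m2 + delta * (x - newMean))
    }
    // Parallel merge of two partial states (the combine step used between partitions)
    def merge(that: StdState): StdState = {
      val n = count + that.count
      if (n == 0) this
      else {
        val delta = that.mean - mean
        StdState(n,
          (mean * count + that.mean * that.count) / n,
          m2 + that.m2 + delta * delta * count * that.count / n)
      }
    }
    // count == 0 -> undefined (the patch returns null); count == 1 -> 0.0
    def stddevSamp: Double =
      if (count < 1) Double.NaN else if (count < 2) 0.0 else math.sqrt(m2 / (count - 1))
    def stddevPop: Double =
      if (count < 1) Double.NaN else math.sqrt(m2 / count)
  }

  // e.g. Seq(1.0, 2.0, 3.0).foldLeft(StdState(0, 0, 0))(_ update _).stddevSamp  // == 1.0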
--- R/pkg/inst/tests/test_sparkSQL.R | 2 +- python/pyspark/sql/dataframe.py | 36 +-- .../catalyst/analysis/FunctionRegistry.scala | 3 + .../catalyst/analysis/HiveTypeCoercion.scala | 3 + .../spark/sql/catalyst/dsl/package.scala | 3 + .../expressions/aggregate/functions.scala | 143 ++++++++++ .../expressions/aggregate/utils.scala | 18 ++ .../sql/catalyst/expressions/aggregates.scala | 245 ++++++++++++++++++ .../org/apache/spark/sql/DataFrame.scala | 6 +- .../org/apache/spark/sql/GroupedData.scala | 39 +++ .../org/apache/spark/sql/functions.scala | 27 ++ .../apache/spark/sql/JavaDataFrameSuite.java | 1 + .../spark/sql/DataFrameAggregateSuite.scala | 33 +++ .../org/apache/spark/sql/DataFrameSuite.scala | 2 +- .../org/apache/spark/sql/SQLQuerySuite.scala | 42 ++- .../execution/AggregationQuerySuite.scala | 35 --- 16 files changed, 574 insertions(+), 64 deletions(-) diff --git a/R/pkg/inst/tests/test_sparkSQL.R b/R/pkg/inst/tests/test_sparkSQL.R index 1ccfde59176f5..98d4402d368e1 100644 --- a/R/pkg/inst/tests/test_sparkSQL.R +++ b/R/pkg/inst/tests/test_sparkSQL.R @@ -1147,7 +1147,7 @@ test_that("describe() and summarize() on a DataFrame", { stats <- describe(df, "age") expect_equal(collect(stats)[1, "summary"], "count") expect_equal(collect(stats)[2, "age"], "24.5") - expect_equal(collect(stats)[3, "age"], "5.5") + expect_equal(collect(stats)[3, "age"], "7.7781745930520225") stats <- describe(df) expect_equal(collect(stats)[4, "name"], "Andy") expect_equal(collect(stats)[5, "age"], "30") diff --git a/python/pyspark/sql/dataframe.py b/python/pyspark/sql/dataframe.py index c5bf55791240b..fb995fa3a76b5 100644 --- a/python/pyspark/sql/dataframe.py +++ b/python/pyspark/sql/dataframe.py @@ -653,25 +653,25 @@ def describe(self, *cols): guarantee about the backward compatibility of the schema of the resulting DataFrame. 
>>> df.describe().show() - +-------+---+ - |summary|age| - +-------+---+ - | count| 2| - | mean|3.5| - | stddev|1.5| - | min| 2| - | max| 5| - +-------+---+ + +-------+------------------+ + |summary| age| + +-------+------------------+ + | count| 2| + | mean| 3.5| + | stddev|2.1213203435596424| + | min| 2| + | max| 5| + +-------+------------------+ >>> df.describe(['age', 'name']).show() - +-------+---+-----+ - |summary|age| name| - +-------+---+-----+ - | count| 2| 2| - | mean|3.5| null| - | stddev|1.5| null| - | min| 2|Alice| - | max| 5| Bob| - +-------+---+-----+ + +-------+------------------+-----+ + |summary| age| name| + +-------+------------------+-----+ + | count| 2| 2| + | mean| 3.5| null| + | stddev|2.1213203435596424| null| + | min| 2|Alice| + | max| 5| Bob| + +-------+------------------+-----+ """ if len(cols) == 1 and isinstance(cols[0], list): cols = cols[0] diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/FunctionRegistry.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/FunctionRegistry.scala index cd5a90d788151..11b4866bf264b 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/FunctionRegistry.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/FunctionRegistry.scala @@ -168,6 +168,9 @@ object FunctionRegistry { expression[Last]("last"), expression[Max]("max"), expression[Min]("min"), + expression[Stddev]("stddev"), + expression[StddevPop]("stddev_pop"), + expression[StddevSamp]("stddev_samp"), expression[Sum]("sum"), // string functions diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/HiveTypeCoercion.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/HiveTypeCoercion.scala index 87c11abbad490..87a3845b2d9e5 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/HiveTypeCoercion.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/HiveTypeCoercion.scala @@ -297,6 +297,9 @@ object HiveTypeCoercion { case Sum(e @ StringType()) => Sum(Cast(e, DoubleType)) case SumDistinct(e @ StringType()) => Sum(Cast(e, DoubleType)) case Average(e @ StringType()) => Average(Cast(e, DoubleType)) + case Stddev(e @ StringType()) => Stddev(Cast(e, DoubleType)) + case StddevPop(e @ StringType()) => StddevPop(Cast(e, DoubleType)) + case StddevSamp(e @ StringType()) => StddevSamp(Cast(e, DoubleType)) } } diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/dsl/package.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/dsl/package.scala index a7e3a49327655..699c4cc63d09a 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/dsl/package.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/dsl/package.scala @@ -159,6 +159,9 @@ package object dsl { def lower(e: Expression): Expression = Lower(e) def sqrt(e: Expression): Expression = Sqrt(e) def abs(e: Expression): Expression = Abs(e) + def stddev(e: Expression): Expression = Stddev(e) + def stddev_pop(e: Expression): Expression = StddevPop(e) + def stddev_samp(e: Expression): Expression = StddevSamp(e) implicit class DslSymbol(sym: Symbol) extends ImplicitAttribute { def s: String = sym.name } // TODO more implicit class for literal? 
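The changed expectations above (R: 5.5 -> 7.7781745930520225; Python describe(): 1.5 -> 2.1213203435596424) follow from stddev now being the unbiased sample estimator rather than the previous population-style sqrt(avg(x^2) - avg(x)^2). A quick sanity check in plain Scala for the doctest ages {2, 5} (illustrative only, not part of the patch):

  val ages = Seq(2.0, 5.0)
  val mean = ages.sum / ages.size                          // 3.5
  val ss   = ages.map(a => (a - mean) * (a - mean)).sum    // 4.5
  math.sqrt(ss / ages.size)                                // 1.5, the old population-style value
  math.sqrt(ss / (ages.size - 1))                          // ~2.12132..., the new sample stddev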
diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/aggregate/functions.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/aggregate/functions.scala index a73024d6adba1..02cd0ac0db118 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/aggregate/functions.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/aggregate/functions.scala @@ -249,6 +249,149 @@ case class Min(child: Expression) extends AlgebraicAggregate { override val evaluateExpression = min } +// Compute the sample standard deviation of a column +case class Stddev(child: Expression) extends StddevAgg(child) { + + override def isSample: Boolean = true + override def prettyName: String = "stddev" +} + +// Compute the population standard deviation of a column +case class StddevPop(child: Expression) extends StddevAgg(child) { + + override def isSample: Boolean = false + override def prettyName: String = "stddev_pop" +} + +// Compute the sample standard deviation of a column +case class StddevSamp(child: Expression) extends StddevAgg(child) { + + override def isSample: Boolean = true + override def prettyName: String = "stddev_samp" +} + +// Compute standard deviation based on online algorithm specified here: +// http://en.wikipedia.org/wiki/Algorithms_for_calculating_variance +abstract class StddevAgg(child: Expression) extends AlgebraicAggregate { + + override def children: Seq[Expression] = child :: Nil + + override def nullable: Boolean = true + + def isSample: Boolean + + // Return data type. + override def dataType: DataType = resultType + + // Expected input data type. + // TODO: Right now, we replace old aggregate functions (based on AggregateExpression1) to the + // new version at planning time (after analysis phase). For now, NullType is added at here + // to make it resolved when we have cases like `select stddev(null)`. + // We can use our analyzer to cast NullType to the default data type of the NumericType once + // we remove the old aggregate functions. Then, we will not need NullType at here. 
+ override def inputTypes: Seq[AbstractDataType] = Seq(TypeCollection(NumericType, NullType)) + + private val resultType = DoubleType + + private val preCount = AttributeReference("preCount", resultType)() + private val currentCount = AttributeReference("currentCount", resultType)() + private val preAvg = AttributeReference("preAvg", resultType)() + private val currentAvg = AttributeReference("currentAvg", resultType)() + private val currentMk = AttributeReference("currentMk", resultType)() + + override val bufferAttributes = preCount :: currentCount :: preAvg :: + currentAvg :: currentMk :: Nil + + override val initialValues = Seq( + /* preCount = */ Cast(Literal(0), resultType), + /* currentCount = */ Cast(Literal(0), resultType), + /* preAvg = */ Cast(Literal(0), resultType), + /* currentAvg = */ Cast(Literal(0), resultType), + /* currentMk = */ Cast(Literal(0), resultType) + ) + + override val updateExpressions = { + + // update average + // avg = avg + (value - avg)/count + def avgAdd: Expression = { + currentAvg + ((Cast(child, resultType) - currentAvg) / currentCount) + } + + // update sum of square of difference from mean + // Mk = Mk + (value - preAvg) * (value - updatedAvg) + def mkAdd: Expression = { + val delta1 = Cast(child, resultType) - preAvg + val delta2 = Cast(child, resultType) - currentAvg + currentMk + (delta1 * delta2) + } + + Seq( + /* preCount = */ If(IsNull(child), preCount, currentCount), + /* currentCount = */ If(IsNull(child), currentCount, + Add(currentCount, Cast(Literal(1), resultType))), + /* preAvg = */ If(IsNull(child), preAvg, currentAvg), + /* currentAvg = */ If(IsNull(child), currentAvg, avgAdd), + /* currentMk = */ If(IsNull(child), currentMk, mkAdd) + ) + } + + override val mergeExpressions = { + + // count merge + def countMerge: Expression = { + currentCount.left + currentCount.right + } + + // average merge + def avgMerge: Expression = { + ((currentAvg.left * preCount) + (currentAvg.right * currentCount.right)) / + (preCount + currentCount.right) + } + + // update sum of square differences + def mkMerge: Expression = { + val avgDelta = currentAvg.right - preAvg + val mkDelta = (avgDelta * avgDelta) * (preCount * currentCount.right) / + (preCount + currentCount.right) + + currentMk.left + currentMk.right + mkDelta + } + + Seq( + /* preCount = */ If(IsNull(currentCount.left), + Cast(Literal(0), resultType), currentCount.left), + /* currentCount = */ If(IsNull(currentCount.left), currentCount.right, + If(IsNull(currentCount.right), currentCount.left, countMerge)), + /* preAvg = */ If(IsNull(currentAvg.left), Cast(Literal(0), resultType), currentAvg.left), + /* currentAvg = */ If(IsNull(currentAvg.left), currentAvg.right, + If(IsNull(currentAvg.right), currentAvg.left, avgMerge)), + /* currentMk = */ If(IsNull(currentMk.left), currentMk.right, + If(IsNull(currentMk.right), currentMk.left, mkMerge)) + ) + } + + override val evaluateExpression = { + // when currentCount == 0, return null + // when currentCount == 1, return 0 + // when currentCount >1 + // stddev_samp = sqrt (currentMk/(currentCount -1)) + // stddev_pop = sqrt (currentMk/currentCount) + val varCol = { + if (isSample) { + currentMk / Cast((currentCount - Cast(Literal(1), resultType)), resultType) + } + else { + currentMk / currentCount + } + } + + If(EqualTo(currentCount, Cast(Literal(0), resultType)), Cast(Literal(null), resultType), + If(EqualTo(currentCount, Cast(Literal(1), resultType)), Cast(Literal(0), resultType), + Cast(Sqrt(varCol), resultType))) + } +} + case class Sum(child: 
Expression) extends AlgebraicAggregate { override def children: Seq[Expression] = child :: Nil diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/aggregate/utils.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/aggregate/utils.scala index 4a43318a95490..ce3dddad87f55 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/aggregate/utils.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/aggregate/utils.scala @@ -85,6 +85,24 @@ object Utils { mode = aggregate.Complete, isDistinct = false) + case expressions.Stddev(child) => + aggregate.AggregateExpression2( + aggregateFunction = aggregate.Stddev(child), + mode = aggregate.Complete, + isDistinct = false) + + case expressions.StddevPop(child) => + aggregate.AggregateExpression2( + aggregateFunction = aggregate.StddevPop(child), + mode = aggregate.Complete, + isDistinct = false) + + case expressions.StddevSamp(child) => + aggregate.AggregateExpression2( + aggregateFunction = aggregate.StddevSamp(child), + mode = aggregate.Complete, + isDistinct = false) + case expressions.Sum(child) => aggregate.AggregateExpression2( aggregateFunction = aggregate.Sum(child), diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/aggregates.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/aggregates.scala index 5e8298aaaa9cb..f1c47f39043c8 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/aggregates.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/aggregates.scala @@ -691,3 +691,248 @@ case class LastFunction(expr: Expression, base: AggregateExpression1) extends Ag result } } + +// Compute standard deviation based on online algorithm specified here: +// http://en.wikipedia.org/wiki/Algorithms_for_calculating_variance +abstract class StddevAgg1(child: Expression) extends UnaryExpression with PartialAggregate1 { + override def nullable: Boolean = true + override def dataType: DataType = DoubleType + + def isSample: Boolean + + override def asPartial: SplitEvaluation = { + val partialStd = Alias(ComputePartialStd(child), "PartialStddev")() + SplitEvaluation(MergePartialStd(partialStd.toAttribute, isSample), partialStd :: Nil) + } + + override def newInstance(): StddevFunction = new StddevFunction(child, this, isSample) + + override def checkInputDataTypes(): TypeCheckResult = + TypeUtils.checkForNumericExpr(child.dataType, "function stddev") + +} + +// Compute the sample standard deviation of a column +case class Stddev(child: Expression) extends StddevAgg1(child) { + + override def toString: String = s"STDDEV($child)" + override def isSample: Boolean = true +} + +// Compute the population standard deviation of a column +case class StddevPop(child: Expression) extends StddevAgg1(child) { + + override def toString: String = s"STDDEV_POP($child)" + override def isSample: Boolean = false +} + +// Compute the sample standard deviation of a column +case class StddevSamp(child: Expression) extends StddevAgg1(child) { + + override def toString: String = s"STDDEV_SAMP($child)" + override def isSample: Boolean = true +} + +case class ComputePartialStd(child: Expression) extends UnaryExpression with AggregateExpression1 { + def this() = this(null) + + override def children: Seq[Expression] = child :: Nil + override def nullable: Boolean = false + override def dataType: DataType = ArrayType(DoubleType) + override def toString: String = 
s"computePartialStddev($child)" + override def newInstance(): ComputePartialStdFunction = + new ComputePartialStdFunction(child, this) +} + +case class ComputePartialStdFunction ( + expr: Expression, + base: AggregateExpression1 +) extends AggregateFunction1 { + def this() = this(null, null) // Required for serialization + + private val computeType = DoubleType + private val zero = Cast(Literal(0), computeType) + private var partialCount: Long = 0L + + // the mean of data processed so far + private val partialAvg: MutableLiteral = MutableLiteral(zero.eval(null), computeType) + + // update average based on this formula: + // avg = avg + (value - avg)/count + private def avgAddFunction (value: Literal): Expression = { + val delta = Subtract(Cast(value, computeType), partialAvg) + Add(partialAvg, Divide(delta, Cast(Literal(partialCount), computeType))) + } + + // the sum of squares of difference from mean + private val partialMk: MutableLiteral = MutableLiteral(zero.eval(null), computeType) + + // update sum of square of difference from mean based on following formula: + // Mk = Mk + (value - preAvg) * (value - updatedAvg) + private def mkAddFunction(value: Literal, prePartialAvg: MutableLiteral): Expression = { + val delta1 = Subtract(Cast(value, computeType), prePartialAvg) + val delta2 = Subtract(Cast(value, computeType), partialAvg) + Add(partialMk, Multiply(delta1, delta2)) + } + + override def update(input: InternalRow): Unit = { + val evaluatedExpr = expr.eval(input) + if (evaluatedExpr != null) { + val exprValue = Literal.create(evaluatedExpr, expr.dataType) + val prePartialAvg = partialAvg.copy() + partialCount += 1 + partialAvg.update(avgAddFunction(exprValue), input) + partialMk.update(mkAddFunction(exprValue, prePartialAvg), input) + } + } + + override def eval(input: InternalRow): Any = { + new GenericArrayData(Array(Cast(Literal(partialCount), computeType).eval(null), + partialAvg.eval(null), + partialMk.eval(null))) + } +} + +case class MergePartialStd( + child: Expression, + isSample: Boolean +) extends UnaryExpression with AggregateExpression1 { + def this() = this(null, false) // required for serialization + + override def children: Seq[Expression] = child:: Nil + override def nullable: Boolean = false + override def dataType: DataType = DoubleType + override def toString: String = s"MergePartialStd($child)" + override def newInstance(): MergePartialStdFunction = { + new MergePartialStdFunction(child, this, isSample) + } +} + +case class MergePartialStdFunction( + expr: Expression, + base: AggregateExpression1, + isSample: Boolean +) extends AggregateFunction1 { + def this() = this (null, null, false) // Required for serialization + + private val computeType = DoubleType + private val zero = Cast(Literal(0), computeType) + private val combineCount = MutableLiteral(zero.eval(null), computeType) + private val combineAvg = MutableLiteral(zero.eval(null), computeType) + private val combineMk = MutableLiteral(zero.eval(null), computeType) + + private def avgUpdateFunction(preCount: Expression, + partialCount: Expression, + partialAvg: Expression): Expression = { + Divide(Add(Multiply(combineAvg, preCount), + Multiply(partialAvg, partialCount)), + Add(preCount, partialCount)) + } + + override def update(input: InternalRow): Unit = { + val evaluatedExpr = expr.eval(input).asInstanceOf[ArrayData] + + if (evaluatedExpr != null) { + val exprValue = evaluatedExpr.toArray(computeType) + val (partialCount, partialAvg, partialMk) = + (Literal.create(exprValue(0), computeType), + 
Literal.create(exprValue(1), computeType), + Literal.create(exprValue(2), computeType)) + + if (Cast(partialCount, LongType).eval(null).asInstanceOf[Long] > 0) { + val preCount = combineCount.copy() + combineCount.update(Add(combineCount, partialCount), input) + + val preAvg = combineAvg.copy() + val avgDelta = Subtract(partialAvg, preAvg) + val mkDelta = Multiply(Multiply(avgDelta, avgDelta), + Divide(Multiply(preCount, partialCount), + combineCount)) + + // update average based on following formula + // (combineAvg * preCount + partialAvg * partialCount) / (preCount + partialCount) + combineAvg.update(avgUpdateFunction(preCount, partialCount, partialAvg), input) + + // update sum of square differences from mean based on following formula + // (combineMk + partialMk + (avgDelta * avgDelta) * (preCount * partialCount/combineCount) + combineMk.update(Add(combineMk, Add(partialMk, mkDelta)), input) + } + } + } + + override def eval(input: InternalRow): Any = { + val count: Long = Cast(combineCount, LongType).eval(null).asInstanceOf[Long] + + if (count == 0) null + else if (count < 2) zero.eval(null) + else { + // when total count > 2 + // stddev_samp = sqrt (combineMk/(combineCount -1)) + // stddev_pop = sqrt (combineMk/combineCount) + val varCol = { + if (isSample) { + Divide(combineMk, Cast(Literal(count - 1), computeType)) + } + else { + Divide(combineMk, Cast(Literal(count), computeType)) + } + } + Sqrt(varCol).eval(null) + } + } +} + +case class StddevFunction( + expr: Expression, + base: AggregateExpression1, + isSample: Boolean +) extends AggregateFunction1 { + + def this() = this(null, null, false) // Required for serialization + + private val computeType = DoubleType + private var curCount: Long = 0L + private val zero = Cast(Literal(0), computeType) + private val curAvg = MutableLiteral(zero.eval(null), computeType) + private val curMk = MutableLiteral(zero.eval(null), computeType) + + private def curAvgAddFunction(value: Literal): Expression = { + val delta = Subtract(Cast(value, computeType), curAvg) + Add(curAvg, Divide(delta, Cast(Literal(curCount), computeType))) + } + private def curMkAddFunction(value: Literal, preAvg: MutableLiteral): Expression = { + val delta1 = Subtract(Cast(value, computeType), preAvg) + val delta2 = Subtract(Cast(value, computeType), curAvg) + Add(curMk, Multiply(delta1, delta2)) + } + + override def update(input: InternalRow): Unit = { + val evaluatedExpr = expr.eval(input) + if (evaluatedExpr != null) { + val preAvg: MutableLiteral = curAvg.copy() + val exprValue = Literal.create(evaluatedExpr, expr.dataType) + curCount += 1L + curAvg.update(curAvgAddFunction(exprValue), input) + curMk.update(curMkAddFunction(exprValue, preAvg), input) + } + } + + override def eval(input: InternalRow): Any = { + if (curCount == 0) null + else if (curCount < 2) zero.eval(null) + else { + // when total count > 2, + // stddev_samp = sqrt(curMk/(curCount - 1)) + // stddev_pop = sqrt(curMk/curCount) + val varCol = { + if (isSample) { + Divide(curMk, Cast(Literal(curCount - 1), computeType)) + } + else { + Divide(curMk, Cast(Literal(curCount), computeType)) + } + } + Sqrt(varCol).eval(null) + } + } +} diff --git a/sql/core/src/main/scala/org/apache/spark/sql/DataFrame.scala b/sql/core/src/main/scala/org/apache/spark/sql/DataFrame.scala index 791c10c3d7ce7..1a687b2374f14 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/DataFrame.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/DataFrame.scala @@ -1288,15 +1288,11 @@ class DataFrame private[sql]( 
@scala.annotation.varargs def describe(cols: String*): DataFrame = { - // TODO: Add stddev as an expression, and remove it from here. - def stddevExpr(expr: Expression): Expression = - Sqrt(Subtract(Average(Multiply(expr, expr)), Multiply(Average(expr), Average(expr)))) - // The list of summary statistics to compute, in the form of expressions. val statistics = List[(String, Expression => Expression)]( "count" -> Count, "mean" -> Average, - "stddev" -> stddevExpr, + "stddev" -> Stddev, "min" -> Min, "max" -> Max) diff --git a/sql/core/src/main/scala/org/apache/spark/sql/GroupedData.scala b/sql/core/src/main/scala/org/apache/spark/sql/GroupedData.scala index ee31d83cce42c..102b802ad0a0a 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/GroupedData.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/GroupedData.scala @@ -124,6 +124,9 @@ class GroupedData protected[sql]( case "avg" | "average" | "mean" => Average case "max" => Max case "min" => Min + case "stddev" => Stddev + case "stddev_pop" => StddevPop + case "stddev_samp" => StddevSamp case "sum" => Sum case "count" | "size" => // Turn count(*) into count(1) @@ -283,6 +286,42 @@ class GroupedData protected[sql]( aggregateNumericColumns(colNames : _*)(Min) } + /** + * Compute the sample standard deviation for each numeric columns for each group. + * The resulting [[DataFrame]] will also contain the grouping columns. + * When specified columns are given, only compute the stddev for them. + * + * @since 1.6.0 + */ + @scala.annotation.varargs + def stddev(colNames: String*): DataFrame = { + aggregateNumericColumns(colNames : _*)(Stddev) + } + + /** + * Compute the population standard deviation for each numeric columns for each group. + * The resulting [[DataFrame]] will also contain the grouping columns. + * When specified columns are given, only compute the stddev for them. + * + * @since 1.6.0 + */ + @scala.annotation.varargs + def stddev_pop(colNames: String*): DataFrame = { + aggregateNumericColumns(colNames : _*)(StddevPop) + } + + /** + * Compute the sample standard deviation for each numeric columns for each group. + * The resulting [[DataFrame]] will also contain the grouping columns. + * When specified columns are given, only compute the stddev for them. + * + * @since 1.6.0 + */ + @scala.annotation.varargs + def stddev_samp(colNames: String*): DataFrame = { + aggregateNumericColumns(colNames : _*)(StddevSamp) + } + /** * Compute the sum for each numeric columns for each group. * The resulting [[DataFrame]] will also contain the grouping columns. diff --git a/sql/core/src/main/scala/org/apache/spark/sql/functions.scala b/sql/core/src/main/scala/org/apache/spark/sql/functions.scala index 435e6319a64c4..60d9c509104d5 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/functions.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/functions.scala @@ -294,6 +294,33 @@ object functions { */ def min(columnName: String): Column = min(Column(columnName)) + /** + * Aggregate function: returns the unbiased sample standard deviation + * of the expression in a group. + * + * @group agg_funcs + * @since 1.6.0 + */ + def stddev(e: Column): Column = Stddev(e.expr) + + /** + * Aggregate function: returns the population standard deviation of + * the expression in a group. + * + * @group agg_funcs + * @since 1.6.0 + */ + def stddev_pop(e: Column): Column = StddevPop(e.expr) + + /** + * Aggregate function: returns the unbiased sample standard deviation of + * the expression in a group. 
+ * + * @group agg_funcs + * @since 1.6.0 + */ + def stddev_samp(e: Column): Column = StddevSamp(e.expr) + /** * Aggregate function: returns the sum of all values in the expression. * diff --git a/sql/core/src/test/java/test/org/apache/spark/sql/JavaDataFrameSuite.java b/sql/core/src/test/java/test/org/apache/spark/sql/JavaDataFrameSuite.java index d981ce947f435..5f9abd4999ce0 100644 --- a/sql/core/src/test/java/test/org/apache/spark/sql/JavaDataFrameSuite.java +++ b/sql/core/src/test/java/test/org/apache/spark/sql/JavaDataFrameSuite.java @@ -90,6 +90,7 @@ public void testVarargMethods() { df.groupBy().mean("key"); df.groupBy().max("key"); df.groupBy().min("key"); + df.groupBy().stddev("key"); df.groupBy().sum("key"); // Varargs in column expressions diff --git a/sql/core/src/test/scala/org/apache/spark/sql/DataFrameAggregateSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/DataFrameAggregateSuite.scala index c0950b09b14ad..f5ef9ffd7f4f2 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/DataFrameAggregateSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/DataFrameAggregateSuite.scala @@ -175,6 +175,39 @@ class DataFrameAggregateSuite extends QueryTest with SharedSQLContext { Row(0, null)) } + test("stddev") { + val testData2ADev = math.sqrt(4/5.0) + + checkAnswer( + testData2.agg(stddev('a)), + Row(testData2ADev)) + + checkAnswer( + testData2.agg(stddev_pop('a)), + Row(math.sqrt(4/6.0))) + + checkAnswer( + testData2.agg(stddev_samp('a)), + Row(testData2ADev)) + } + + test("zero stddev") { + val emptyTableData = Seq.empty[(Int, Int)].toDF("a", "b") + assert(emptyTableData.count() == 0) + + checkAnswer( + emptyTableData.agg(stddev('a)), + Row(null)) + + checkAnswer( + emptyTableData.agg(stddev_pop('a)), + Row(null)) + + checkAnswer( + emptyTableData.agg(stddev_samp('a)), + Row(null)) + } + test("zero sum") { val emptyTableData = Seq.empty[(Int, Int)].toDF("a", "b") checkAnswer( diff --git a/sql/core/src/test/scala/org/apache/spark/sql/DataFrameSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/DataFrameSuite.scala index dbed4fc247140..c167999af580e 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/DataFrameSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/DataFrameSuite.scala @@ -436,7 +436,7 @@ class DataFrameSuite extends QueryTest with SharedSQLContext { val describeResult = Seq( Row("count", "4", "4"), Row("mean", "33.0", "178.0"), - Row("stddev", "16.583123951777", "10.0"), + Row("stddev", "19.148542155126762", "11.547005383792516"), Row("min", "16", "164"), Row("max", "60", "192")) diff --git a/sql/core/src/test/scala/org/apache/spark/sql/SQLQuerySuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/SQLQuerySuite.scala index 664b7a1512faf..962b100b532c9 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/SQLQuerySuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/SQLQuerySuite.scala @@ -328,6 +328,13 @@ class SQLQuerySuite extends QueryTest with SharedSQLContext { testCodeGen( "SELECT min(key) FROM testData3x", Row(1) :: Nil) + // STDDEV + testCodeGen( + "SELECT a, stddev(b), stddev_pop(b) FROM testData2 GROUP BY a", + (1 to 3).map(i => Row(i, math.sqrt(0.5), math.sqrt(0.25)))) + testCodeGen( + "SELECT stddev(b), stddev_pop(b), stddev_samp(b) FROM testData2", + Row(math.sqrt(1.5 / 5), math.sqrt(1.5 / 6), math.sqrt(1.5 / 5)) :: Nil) // Some combinations. 
testCodeGen( """ @@ -348,8 +355,8 @@ class SQLQuerySuite extends QueryTest with SharedSQLContext { Row(100, 1, 50.5, 300, 100) :: Nil) // Aggregate with Code generation handling all null values testCodeGen( - "SELECT sum('a'), avg('a'), count(null) FROM testData", - Row(null, null, 0) :: Nil) + "SELECT sum('a'), avg('a'), stddev('a'), count(null) FROM testData", + Row(null, null, null, 0) :: Nil) } finally { sqlContext.dropTempTable("testData3x") sqlContext.setConf(SQLConf.CODEGEN_ENABLED, originalValue) @@ -515,8 +522,8 @@ class SQLQuerySuite extends QueryTest with SharedSQLContext { test("aggregates with nulls") { checkAnswer( - sql("SELECT MIN(a), MAX(a), AVG(a), SUM(a), COUNT(a) FROM nullInts"), - Row(1, 3, 2, 6, 3) + sql("SELECT MIN(a), MAX(a), AVG(a), STDDEV(a), SUM(a), COUNT(a) FROM nullInts"), + Row(1, 3, 2, 1, 6, 3) ) } @@ -722,6 +729,33 @@ class SQLQuerySuite extends QueryTest with SharedSQLContext { } } + test("stddev") { + checkAnswer( + sql("SELECT STDDEV(a) FROM testData2"), + Row(math.sqrt(4/5.0)) + ) + } + + test("stddev_pop") { + checkAnswer( + sql("SELECT STDDEV_POP(a) FROM testData2"), + Row(math.sqrt(4/6.0)) + ) + } + + test("stddev_samp") { + checkAnswer( + sql("SELECT STDDEV_SAMP(a) FROM testData2"), + Row(math.sqrt(4/5.0)) + ) + } + + test("stddev agg") { + checkAnswer( + sql("SELECT a, stddev(b), stddev_pop(b), stddev_samp(b) FROM testData2 GROUP BY a"), + (1 to 3).map(i => Row(i, math.sqrt(1/2.0), math.sqrt(1/4.0), math.sqrt(1/2.0)))) + } + test("inner join where, one match per row") { checkAnswer( sql("SELECT * FROM upperCaseData JOIN lowerCaseData WHERE n = N"), diff --git a/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/AggregationQuerySuite.scala b/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/AggregationQuerySuite.scala index b126ec455fc69..a73b1bd52c09f 100644 --- a/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/AggregationQuerySuite.scala +++ b/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/AggregationQuerySuite.scala @@ -507,41 +507,6 @@ abstract class AggregationQuerySuite extends QueryTest with SQLTestUtils with Te }.getMessage assert(errorMessage.contains("implemented based on the new Aggregate Function interface")) } - - // TODO: once we support Hive UDAF in the new interface, - // we can remove the following two tests. - withSQLConf("spark.sql.useAggregate2" -> "true") { - val errorMessage = intercept[AnalysisException] { - sqlContext.sql( - """ - |SELECT - | key, - | mydoublesum(value + 1.5 * key), - | stddev_samp(value) - |FROM agg1 - |GROUP BY key - """.stripMargin).collect() - }.getMessage - assert(errorMessage.contains("implemented based on the new Aggregate Function interface")) - - // This will fall back to the old aggregate - val newAggregateOperators = sqlContext.sql( - """ - |SELECT - | key, - | sum(value + 1.5 * key), - | stddev_samp(value) - |FROM agg1 - |GROUP BY key - """.stripMargin).queryExecution.executedPlan.collect { - case agg: aggregate.SortBasedAggregate => agg - case agg: aggregate.TungstenAggregate => agg - } - val message = - "We should fallback to the old aggregation code path if " + - "there is any aggregate function that cannot be converted to the new interface." 
- assert(newAggregateOperators.isEmpty, message) - } } } From b3a7480ab0821ab38f710de96e3ac4a13f62dbca Mon Sep 17 00:00:00 2001 From: Josh Rosen Date: Sat, 12 Sep 2015 16:23:55 -0700 Subject: [PATCH 269/802] [SPARK-10330] Add Scalastyle rule to require use of SparkHadoopUtil JobContext methods This is a followup to #8499 which adds a Scalastyle rule to mandate the use of SparkHadoopUtil's JobContext accessor methods and fixes the existing violations. Author: Josh Rosen Closes #8521 from JoshRosen/SPARK-10330-part2. --- .../src/main/scala/org/apache/spark/SparkContext.scala | 6 +++--- .../org/apache/spark/deploy/SparkHadoopUtil.scala | 4 ++++ .../scala/org/apache/spark/rdd/PairRDDFunctions.scala | 8 +++++--- .../scala/org/apache/spark/rdd/SqlNewHadoopRDD.scala | 2 +- core/src/test/scala/org/apache/spark/FileSuite.scala | 6 ++++-- .../org/apache/spark/examples/CassandraCQLTest.scala | 3 +++ .../org/apache/spark/examples/CassandraTest.scala | 2 ++ scalastyle-config.xml | 8 ++++++++ .../sql/execution/datasources/WriterContainer.scala | 8 ++++++-- .../sql/execution/datasources/json/JSONRelation.scala | 2 +- .../datasources/parquet/CatalystReadSupport.scala | 6 +++++- .../parquet/DirectParquetOutputCommitter.scala | 6 +++++- .../datasources/parquet/ParquetRelation.scala | 10 +++++++--- .../datasources/parquet/ParquetTypesConverter.scala | 6 +++++- .../org/apache/spark/sql/hive/orc/OrcRelation.scala | 4 ++-- 15 files changed, 61 insertions(+), 20 deletions(-) diff --git a/core/src/main/scala/org/apache/spark/SparkContext.scala b/core/src/main/scala/org/apache/spark/SparkContext.scala index cbfe8bf31c3d6..e27b3c4962221 100644 --- a/core/src/main/scala/org/apache/spark/SparkContext.scala +++ b/core/src/main/scala/org/apache/spark/SparkContext.scala @@ -858,7 +858,7 @@ class SparkContext(config: SparkConf) extends Logging with ExecutorAllocationCli // Use setInputPaths so that wholeTextFiles aligns with hadoopFile/textFile in taking // comma separated files as input. (see SPARK-7155) NewFileInputFormat.setInputPaths(job, path) - val updateConf = job.getConfiguration + val updateConf = SparkHadoopUtil.get.getConfigurationFromJobContext(job) new WholeTextFileRDD( this, classOf[WholeTextFileInputFormat], @@ -910,7 +910,7 @@ class SparkContext(config: SparkConf) extends Logging with ExecutorAllocationCli // Use setInputPaths so that binaryFiles aligns with hadoopFile/textFile in taking // comma separated files as input. (see SPARK-7155) NewFileInputFormat.setInputPaths(job, path) - val updateConf = job.getConfiguration + val updateConf = SparkHadoopUtil.get.getConfigurationFromJobContext(job) new BinaryFileRDD( this, classOf[StreamInputFormat], @@ -1092,7 +1092,7 @@ class SparkContext(config: SparkConf) extends Logging with ExecutorAllocationCli // Use setInputPaths so that newAPIHadoopFile aligns with hadoopFile/textFile in taking // comma separated files as input. 
(see SPARK-7155) NewFileInputFormat.setInputPaths(job, path) - val updatedConf = job.getConfiguration + val updatedConf = SparkHadoopUtil.get.getConfigurationFromJobContext(job) new NewHadoopRDD(this, fClass, kClass, vClass, updatedConf).setName(path) } diff --git a/core/src/main/scala/org/apache/spark/deploy/SparkHadoopUtil.scala b/core/src/main/scala/org/apache/spark/deploy/SparkHadoopUtil.scala index f7723ef5bde4c..a0b7365df900a 100644 --- a/core/src/main/scala/org/apache/spark/deploy/SparkHadoopUtil.scala +++ b/core/src/main/scala/org/apache/spark/deploy/SparkHadoopUtil.scala @@ -192,7 +192,9 @@ class SparkHadoopUtil extends Logging { * while it's interface in Hadoop 2.+. */ def getConfigurationFromJobContext(context: JobContext): Configuration = { + // scalastyle:off jobconfig val method = context.getClass.getMethod("getConfiguration") + // scalastyle:on jobconfig method.invoke(context).asInstanceOf[Configuration] } @@ -204,7 +206,9 @@ class SparkHadoopUtil extends Logging { */ def getTaskAttemptIDFromTaskAttemptContext( context: MapReduceTaskAttemptContext): MapReduceTaskAttemptID = { + // scalastyle:off jobconfig val method = context.getClass.getMethod("getTaskAttemptID") + // scalastyle:on jobconfig method.invoke(context).asInstanceOf[MapReduceTaskAttemptID] } diff --git a/core/src/main/scala/org/apache/spark/rdd/PairRDDFunctions.scala b/core/src/main/scala/org/apache/spark/rdd/PairRDDFunctions.scala index c59f0d4aa75a0..199d79b811d65 100644 --- a/core/src/main/scala/org/apache/spark/rdd/PairRDDFunctions.scala +++ b/core/src/main/scala/org/apache/spark/rdd/PairRDDFunctions.scala @@ -996,8 +996,9 @@ class PairRDDFunctions[K, V](self: RDD[(K, V)]) job.setOutputKeyClass(keyClass) job.setOutputValueClass(valueClass) job.setOutputFormatClass(outputFormatClass) - job.getConfiguration.set("mapred.output.dir", path) - saveAsNewAPIHadoopDataset(job.getConfiguration) + val jobConfiguration = SparkHadoopUtil.get.getConfigurationFromJobContext(job) + jobConfiguration.set("mapred.output.dir", path) + saveAsNewAPIHadoopDataset(jobConfiguration) } /** @@ -1064,7 +1065,8 @@ class PairRDDFunctions[K, V](self: RDD[(K, V)]) val formatter = new SimpleDateFormat("yyyyMMddHHmm") val jobtrackerID = formatter.format(new Date()) val stageId = self.id - val wrappedConf = new SerializableConfiguration(job.getConfiguration) + val jobConfiguration = SparkHadoopUtil.get.getConfigurationFromJobContext(job) + val wrappedConf = new SerializableConfiguration(jobConfiguration) val outfmt = job.getOutputFormatClass val jobFormat = outfmt.newInstance diff --git a/core/src/main/scala/org/apache/spark/rdd/SqlNewHadoopRDD.scala b/core/src/main/scala/org/apache/spark/rdd/SqlNewHadoopRDD.scala index 9babe56267e08..0228c54e0511c 100644 --- a/core/src/main/scala/org/apache/spark/rdd/SqlNewHadoopRDD.scala +++ b/core/src/main/scala/org/apache/spark/rdd/SqlNewHadoopRDD.scala @@ -86,7 +86,7 @@ private[spark] class SqlNewHadoopRDD[V: ClassTag]( if (isDriverSide) { initDriverSideJobFuncOpt.map(f => f(job)) } - job.getConfiguration + SparkHadoopUtil.get.getConfigurationFromJobContext(job) } private val jobTrackerId: String = { diff --git a/core/src/test/scala/org/apache/spark/FileSuite.scala b/core/src/test/scala/org/apache/spark/FileSuite.scala index 418763f4e5ffa..fdb00aafc4a48 100644 --- a/core/src/test/scala/org/apache/spark/FileSuite.scala +++ b/core/src/test/scala/org/apache/spark/FileSuite.scala @@ -19,6 +19,7 @@ package org.apache.spark import java.io.{File, FileWriter} +import org.apache.spark.deploy.SparkHadoopUtil import 
org.apache.spark.input.PortableDataStream import org.apache.spark.storage.StorageLevel @@ -506,8 +507,9 @@ class FileSuite extends SparkFunSuite with LocalSparkContext { job.setOutputKeyClass(classOf[String]) job.setOutputValueClass(classOf[String]) job.setOutputFormatClass(classOf[NewTextOutputFormat[String, String]]) - job.getConfiguration.set("mapred.output.dir", tempDir.getPath + "/outputDataset_new") - randomRDD.saveAsNewAPIHadoopDataset(job.getConfiguration) + val jobConfig = SparkHadoopUtil.get.getConfigurationFromJobContext(job) + jobConfig.set("mapred.output.dir", tempDir.getPath + "/outputDataset_new") + randomRDD.saveAsNewAPIHadoopDataset(jobConfig) assert(new File(tempDir.getPath + "/outputDataset_new/part-r-00000").exists() === true) } diff --git a/examples/src/main/scala/org/apache/spark/examples/CassandraCQLTest.scala b/examples/src/main/scala/org/apache/spark/examples/CassandraCQLTest.scala index fa07c1e5017cd..d1b9b8d398dd8 100644 --- a/examples/src/main/scala/org/apache/spark/examples/CassandraCQLTest.scala +++ b/examples/src/main/scala/org/apache/spark/examples/CassandraCQLTest.scala @@ -16,6 +16,7 @@ */ // scalastyle:off println + // scalastyle:off jobcontext package org.apache.spark.examples import java.nio.ByteBuffer @@ -81,6 +82,7 @@ object CassandraCQLTest { val job = new Job() job.setInputFormatClass(classOf[CqlPagingInputFormat]) + val configuration = job.getConfiguration ConfigHelper.setInputInitialAddress(job.getConfiguration(), cHost) ConfigHelper.setInputRpcPort(job.getConfiguration(), cPort) ConfigHelper.setInputColumnFamily(job.getConfiguration(), KeySpace, InputColumnFamily) @@ -135,3 +137,4 @@ object CassandraCQLTest { } } // scalastyle:on println +// scalastyle:on jobcontext diff --git a/examples/src/main/scala/org/apache/spark/examples/CassandraTest.scala b/examples/src/main/scala/org/apache/spark/examples/CassandraTest.scala index 2e56d24c60c33..1e679bfb55343 100644 --- a/examples/src/main/scala/org/apache/spark/examples/CassandraTest.scala +++ b/examples/src/main/scala/org/apache/spark/examples/CassandraTest.scala @@ -16,6 +16,7 @@ */ // scalastyle:off println +// scalastyle:off jobcontext package org.apache.spark.examples import java.nio.ByteBuffer @@ -130,6 +131,7 @@ object CassandraTest { } } // scalastyle:on println +// scalastyle:on jobcontext /* create keyspace casDemo; diff --git a/scalastyle-config.xml b/scalastyle-config.xml index 68fdb4141cf27..64a0c71bbef2a 100644 --- a/scalastyle-config.xml +++ b/scalastyle-config.xml @@ -168,6 +168,14 @@ This file is divided into 3 sections: scala.collection.JavaConverters._ and use .asScala / .asJava methods + + + ^getConfiguration$|^getTaskAttemptID$ + Instead of calling .getConfiguration() or .getTaskAttemptID() directly, + use SparkHadoopUtil's getConfigurationFromJobContext() and getTaskAttemptIDFromTaskAttemptContext() methods. 
+ + + diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/WriterContainer.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/WriterContainer.scala index 9a573db0c023a..f8ef674ed29c1 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/WriterContainer.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/WriterContainer.scala @@ -47,7 +47,8 @@ private[sql] abstract class BaseWriterContainer( protected val dataSchema = relation.dataSchema - protected val serializableConf = new SerializableConfiguration(job.getConfiguration) + protected val serializableConf = + new SerializableConfiguration(SparkHadoopUtil.get.getConfigurationFromJobContext(job)) // This UUID is used to avoid output file name collision between different appending write jobs. // These jobs may belong to different SparkContext instances. Concrete data source implementations @@ -89,7 +90,8 @@ private[sql] abstract class BaseWriterContainer( // This UUID is sent to executor side together with the serialized `Configuration` object within // the `Job` instance. `OutputWriters` on the executor side should use this UUID to generate // unique task output files. - job.getConfiguration.set("spark.sql.sources.writeJobUUID", uniqueWriteJobId.toString) + SparkHadoopUtil.get.getConfigurationFromJobContext(job). + set("spark.sql.sources.writeJobUUID", uniqueWriteJobId.toString) // Order of the following two lines is important. For Hadoop 1, TaskAttemptContext constructor // clones the Configuration object passed in. If we initialize the TaskAttemptContext first, @@ -182,7 +184,9 @@ private[sql] abstract class BaseWriterContainer( private def setupIDs(jobId: Int, splitId: Int, attemptId: Int): Unit = { this.jobId = SparkHadoopWriter.createJobID(new Date, jobId) this.taskId = new TaskID(this.jobId, true, splitId) + // scalastyle:off jobcontext this.taskAttemptId = new TaskAttemptID(taskId, attemptId) + // scalastyle:on jobcontext } private def setupConf(): Unit = { diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/json/JSONRelation.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/json/JSONRelation.scala index 7a49157d9e72c..8ee0127c3bde8 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/json/JSONRelation.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/json/JSONRelation.scala @@ -81,7 +81,7 @@ private[sql] class JSONRelation( private def createBaseRdd(inputPaths: Array[FileStatus]): RDD[String] = { val job = new Job(sqlContext.sparkContext.hadoopConfiguration) - val conf = job.getConfiguration + val conf = SparkHadoopUtil.get.getConfigurationFromJobContext(job) val paths = inputPaths.map(_.getPath) diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/parquet/CatalystReadSupport.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/parquet/CatalystReadSupport.scala index 5a8166fac5418..8c819f1a48cd6 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/parquet/CatalystReadSupport.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/parquet/CatalystReadSupport.scala @@ -72,7 +72,11 @@ private[parquet] class CatalystReadSupport extends ReadSupport[InternalRow] with // Called before `prepareForRead()` when initializing Parquet record reader. 
override def init(context: InitContext): ReadContext = { - val conf = context.getConfiguration + val conf = { + // scalastyle:off jobcontext + context.getConfiguration + // scalastyle:on jobcontext + } // If the target file was written by Spark SQL, we should be able to find a serialized Catalyst // schema of this file from its metadata. diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/parquet/DirectParquetOutputCommitter.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/parquet/DirectParquetOutputCommitter.scala index 2c6b914328b60..de1fd0166ac5a 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/parquet/DirectParquetOutputCommitter.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/parquet/DirectParquetOutputCommitter.scala @@ -53,7 +53,11 @@ private[parquet] class DirectParquetOutputCommitter(outputPath: Path, context: T override def setupTask(taskContext: TaskAttemptContext): Unit = {} override def commitJob(jobContext: JobContext) { - val configuration = ContextUtil.getConfiguration(jobContext) + val configuration = { + // scalastyle:off jobcontext + ContextUtil.getConfiguration(jobContext) + // scalastyle:on jobcontext + } val fileSystem = outputPath.getFileSystem(configuration) if (configuration.getBoolean(ParquetOutputFormat.ENABLE_JOB_SUMMARY, true)) { diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/parquet/ParquetRelation.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/parquet/ParquetRelation.scala index c6bbc392cad4c..953fcab126970 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/parquet/ParquetRelation.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/parquet/ParquetRelation.scala @@ -211,7 +211,11 @@ private[sql] class ParquetRelation( override def sizeInBytes: Long = metadataCache.dataStatuses.map(_.getLen).sum override def prepareJobForWrite(job: Job): OutputWriterFactory = { - val conf = ContextUtil.getConfiguration(job) + val conf = { + // scalastyle:off jobcontext + ContextUtil.getConfiguration(job) + // scalastyle:on jobcontext + } // SPARK-9849 DirectParquetOutputCommitter qualified name should be backward compatible val committerClassname = conf.get(SQLConf.PARQUET_OUTPUT_COMMITTER_CLASS.key) @@ -528,7 +532,7 @@ private[sql] object ParquetRelation extends Logging { assumeBinaryIsString: Boolean, assumeInt96IsTimestamp: Boolean, followParquetFormatSpec: Boolean)(job: Job): Unit = { - val conf = job.getConfiguration + val conf = SparkHadoopUtil.get.getConfigurationFromJobContext(job) conf.set(ParquetInputFormat.READ_SUPPORT_CLASS, classOf[CatalystReadSupport].getName) // Try to push down filters when filter push-down is enabled. 
@@ -572,7 +576,7 @@ private[sql] object ParquetRelation extends Logging { FileInputFormat.setInputPaths(job, inputFiles.map(_.getPath): _*) } - overrideMinSplitSize(parquetBlockSize, job.getConfiguration) + overrideMinSplitSize(parquetBlockSize, SparkHadoopUtil.get.getConfigurationFromJobContext(job)) } private[parquet] def readSchema( diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/parquet/ParquetTypesConverter.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/parquet/ParquetTypesConverter.scala index 142301fe87cb6..b647bb6116afa 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/parquet/ParquetTypesConverter.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/parquet/ParquetTypesConverter.scala @@ -123,7 +123,11 @@ private[parquet] object ParquetTypesConverter extends Logging { throw new IllegalArgumentException("Unable to read Parquet metadata: path is null") } val job = new Job() - val conf = configuration.getOrElse(ContextUtil.getConfiguration(job)) + val conf = { + // scalastyle:off jobcontext + configuration.getOrElse(ContextUtil.getConfiguration(job)) + // scalastyle:on jobcontext + } val fs: FileSystem = origPath.getFileSystem(conf) if (fs == null) { throw new IllegalArgumentException(s"Incorrectly formatted Parquet metadata path $origPath") diff --git a/sql/hive/src/main/scala/org/apache/spark/sql/hive/orc/OrcRelation.scala b/sql/hive/src/main/scala/org/apache/spark/sql/hive/orc/OrcRelation.scala index 7e89109259955..d1f30e188eafb 100644 --- a/sql/hive/src/main/scala/org/apache/spark/sql/hive/orc/OrcRelation.scala +++ b/sql/hive/src/main/scala/org/apache/spark/sql/hive/orc/OrcRelation.scala @@ -208,7 +208,7 @@ private[sql] class OrcRelation( } override def prepareJobForWrite(job: Job): OutputWriterFactory = { - job.getConfiguration match { + SparkHadoopUtil.get.getConfigurationFromJobContext(job) match { case conf: JobConf => conf.setOutputFormat(classOf[OrcOutputFormat]) case conf => @@ -289,7 +289,7 @@ private[orc] case class OrcTableScan( def execute(): RDD[InternalRow] = { val job = new Job(sqlContext.sparkContext.hadoopConfiguration) - val conf = job.getConfiguration + val conf = SparkHadoopUtil.get.getConfigurationFromJobContext(job) // Tries to push down filters if ORC filter push-down is enabled if (sqlContext.conf.orcFilterPushDown) { From 1dc614b874badde0eee60def46fb47f608bc4759 Mon Sep 17 00:00:00 2001 From: Sean Owen Date: Sun, 13 Sep 2015 08:36:46 +0100 Subject: [PATCH 270/802] [SPARK-10222] [GRAPHX] [DOCS] More thoroughly deprecate Bagel in favor of GraphX Finish deprecating Bagel; remove reference to nonexistent example Author: Sean Owen Closes #8731 from srowen/SPARK-10222. 
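As a point of reference for users migrating off Bagel, the superstep-style compute loop maps directly onto GraphX's Pregel operator. The sketch below is illustrative only (it is not part of this patch and assumes a synthetic log-normal graph); it follows the single-source shortest paths example from the GraphX programming guide:

    import org.apache.spark.SparkContext
    import org.apache.spark.graphx.{Graph, VertexId}
    import org.apache.spark.graphx.util.GraphGenerators

    // Illustrative sketch: single-source shortest paths with graphx.Pregel,
    // the GraphX counterpart of a Bagel compute/superstep loop.
    def shortestPaths(sc: SparkContext, sourceId: VertexId): Unit = {
      // Synthetic graph whose edge attributes serve as distances (an assumption for this example).
      val graph: Graph[Long, Double] =
        GraphGenerators.logNormalGraph(sc, numVertices = 100).mapEdges(e => e.attr.toDouble)
      // Every vertex except the source starts at infinity.
      val initialGraph = graph.mapVertices((id, _) =>
        if (id == sourceId) 0.0 else Double.PositiveInfinity)
      val sssp = initialGraph.pregel(Double.PositiveInfinity)(
        (id, dist, newDist) => math.min(dist, newDist),   // vertex program
        triplet => {                                      // send messages along cheaper paths
          if (triplet.srcAttr + triplet.attr < triplet.dstAttr) {
            Iterator((triplet.dstId, triplet.srcAttr + triplet.attr))
          } else {
            Iterator.empty
          }
        },
        (a, b) => math.min(a, b)                          // merge incoming messages
      )
      println(sssp.vertices.collect.mkString("\n"))
    }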
--- .../src/main/scala/org/apache/spark/bagel/Bagel.scala | 6 ++++++ docs/bagel-programming-guide.md | 10 +--------- docs/index.md | 1 - pom.xml | 2 +- 4 files changed, 8 insertions(+), 11 deletions(-) diff --git a/bagel/src/main/scala/org/apache/spark/bagel/Bagel.scala b/bagel/src/main/scala/org/apache/spark/bagel/Bagel.scala index 4e6b7686f771d..8399033ac61ec 100644 --- a/bagel/src/main/scala/org/apache/spark/bagel/Bagel.scala +++ b/bagel/src/main/scala/org/apache/spark/bagel/Bagel.scala @@ -22,6 +22,7 @@ import org.apache.spark.SparkContext._ import org.apache.spark.rdd.RDD import org.apache.spark.storage.StorageLevel +@deprecated("Uses of Bagel should migrate to GraphX", "1.6.0") object Bagel extends Logging { val DEFAULT_STORAGE_LEVEL = StorageLevel.MEMORY_AND_DISK @@ -270,18 +271,21 @@ object Bagel extends Logging { } } +@deprecated("Uses of Bagel should migrate to GraphX", "1.6.0") trait Combiner[M, C] { def createCombiner(msg: M): C def mergeMsg(combiner: C, msg: M): C def mergeCombiners(a: C, b: C): C } +@deprecated("Uses of Bagel should migrate to GraphX", "1.6.0") trait Aggregator[V, A] { def createAggregator(vert: V): A def mergeAggregators(a: A, b: A): A } /** Default combiner that simply appends messages together (i.e. performs no aggregation) */ +@deprecated("Uses of Bagel should migrate to GraphX", "1.6.0") class DefaultCombiner[M: Manifest] extends Combiner[M, Array[M]] with Serializable { def createCombiner(msg: M): Array[M] = Array(msg) @@ -297,6 +301,7 @@ class DefaultCombiner[M: Manifest] extends Combiner[M, Array[M]] with Serializab * Subclasses may store state along with each vertex and must * inherit from java.io.Serializable or scala.Serializable. */ +@deprecated("Uses of Bagel should migrate to GraphX", "1.6.0") trait Vertex { def active: Boolean } @@ -307,6 +312,7 @@ trait Vertex { * Subclasses may contain a payload to deliver to the target vertex * and must inherit from java.io.Serializable or scala.Serializable. */ +@deprecated("Uses of Bagel should migrate to GraphX", "1.6.0") trait Message[K] { def targetId: K } diff --git a/docs/bagel-programming-guide.md b/docs/bagel-programming-guide.md index c2fe6b0e286ce..347ca4a7af989 100644 --- a/docs/bagel-programming-guide.md +++ b/docs/bagel-programming-guide.md @@ -4,7 +4,7 @@ displayTitle: Bagel Programming Guide title: Bagel --- -**Bagel will soon be superseded by [GraphX](graphx-programming-guide.html); we recommend that new users try GraphX instead.** +**Bagel is deprecated, and superseded by [GraphX](graphx-programming-guide.html).** Bagel is a Spark implementation of Google's [Pregel](http://portal.acm.org/citation.cfm?id=1807184) graph processing framework. Bagel currently supports basic graph computation, combiners, and aggregators. @@ -157,11 +157,3 @@ trait Message[K] { def targetId: K } {% endhighlight %} - -# Where to Go from Here - -Two example jobs, PageRank and shortest path, are included in `examples/src/main/scala/org/apache/spark/examples/bagel`. You can run them by passing the class name to the `bin/run-example` script included in Spark; e.g.: - - ./bin/run-example org.apache.spark.examples.bagel.WikipediaPageRank - -Each example program prints usage help when run without any arguments. 
diff --git a/docs/index.md b/docs/index.md index d85cf12defefd..c0dc2b8d7412a 100644 --- a/docs/index.md +++ b/docs/index.md @@ -90,7 +90,6 @@ options for deployment: * [Spark SQL and DataFrames](sql-programming-guide.html): support for structured data and relational queries * [MLlib](mllib-guide.html): built-in machine learning library * [GraphX](graphx-programming-guide.html): Spark's new API for graph processing - * [Bagel (Pregel on Spark)](bagel-programming-guide.html): older, simple graph processing model **API Docs:** diff --git a/pom.xml b/pom.xml index 88ebceca769e9..421357e141572 100644 --- a/pom.xml +++ b/pom.xml @@ -87,7 +87,7 @@ core - bagel + bagel graphx mllib tools From d81565465cc6d4f38b4ed78036cded630c700388 Mon Sep 17 00:00:00 2001 From: Bertrand Dechoux Date: Mon, 14 Sep 2015 09:18:46 +0100 Subject: [PATCH 271/802] [SPARK-9720] [ML] Identifiable types need UID in toString methods A few Identifiable types did override their toString method but without using the parent implementation. As a consequence, the uid was not present anymore in the toString result. It is the default behaviour. This patch is a quick fix. The question of enforcement is still up. No tests have been written to verify the toString method behaviour. That would be long to do because all types should be tested and not only those which have a regression now. It is possible to enforce the condition using the compiler by making the toString method final but that would introduce unwanted potential API breaking changes (see jira). Author: Bertrand Dechoux Closes #8062 from BertrandDechoux/SPARK-9720. --- .../spark/ml/classification/DecisionTreeClassifier.scala | 2 +- .../org/apache/spark/ml/classification/GBTClassifier.scala | 2 +- .../scala/org/apache/spark/ml/classification/NaiveBayes.scala | 2 +- .../spark/ml/classification/RandomForestClassifier.scala | 2 +- .../src/main/scala/org/apache/spark/ml/feature/RFormula.scala | 4 ++-- .../apache/spark/ml/regression/DecisionTreeRegressor.scala | 2 +- .../scala/org/apache/spark/ml/regression/GBTRegressor.scala | 2 +- .../apache/spark/ml/regression/RandomForestRegressor.scala | 2 +- 8 files changed, 9 insertions(+), 9 deletions(-) diff --git a/mllib/src/main/scala/org/apache/spark/ml/classification/DecisionTreeClassifier.scala b/mllib/src/main/scala/org/apache/spark/ml/classification/DecisionTreeClassifier.scala index 0a75d5d22280f..b8eb49f9bdb48 100644 --- a/mllib/src/main/scala/org/apache/spark/ml/classification/DecisionTreeClassifier.scala +++ b/mllib/src/main/scala/org/apache/spark/ml/classification/DecisionTreeClassifier.scala @@ -146,7 +146,7 @@ final class DecisionTreeClassificationModel private[ml] ( } override def toString: String = { - s"DecisionTreeClassificationModel of depth $depth with $numNodes nodes" + s"DecisionTreeClassificationModel (uid=$uid) of depth $depth with $numNodes nodes" } /** (private[ml]) Convert to a model in the old API */ diff --git a/mllib/src/main/scala/org/apache/spark/ml/classification/GBTClassifier.scala b/mllib/src/main/scala/org/apache/spark/ml/classification/GBTClassifier.scala index 3073a2a61ce83..ad8683648b975 100644 --- a/mllib/src/main/scala/org/apache/spark/ml/classification/GBTClassifier.scala +++ b/mllib/src/main/scala/org/apache/spark/ml/classification/GBTClassifier.scala @@ -200,7 +200,7 @@ final class GBTClassificationModel( } override def toString: String = { - s"GBTClassificationModel with $numTrees trees" + s"GBTClassificationModel (uid=$uid) with $numTrees trees" } /** (private[ml]) Convert to a model in the 
old API */ diff --git a/mllib/src/main/scala/org/apache/spark/ml/classification/NaiveBayes.scala b/mllib/src/main/scala/org/apache/spark/ml/classification/NaiveBayes.scala index 69cb88a7e6718..082ea1ffad58f 100644 --- a/mllib/src/main/scala/org/apache/spark/ml/classification/NaiveBayes.scala +++ b/mllib/src/main/scala/org/apache/spark/ml/classification/NaiveBayes.scala @@ -198,7 +198,7 @@ class NaiveBayesModel private[ml] ( } override def toString: String = { - s"NaiveBayesModel with ${pi.size} classes" + s"NaiveBayesModel (uid=$uid) with ${pi.size} classes" } } diff --git a/mllib/src/main/scala/org/apache/spark/ml/classification/RandomForestClassifier.scala b/mllib/src/main/scala/org/apache/spark/ml/classification/RandomForestClassifier.scala index 11a6d72468333..a6ebee1bb10af 100644 --- a/mllib/src/main/scala/org/apache/spark/ml/classification/RandomForestClassifier.scala +++ b/mllib/src/main/scala/org/apache/spark/ml/classification/RandomForestClassifier.scala @@ -193,7 +193,7 @@ final class RandomForestClassificationModel private[ml] ( } override def toString: String = { - s"RandomForestClassificationModel with $numTrees trees" + s"RandomForestClassificationModel (uid=$uid) with $numTrees trees" } /** diff --git a/mllib/src/main/scala/org/apache/spark/ml/feature/RFormula.scala b/mllib/src/main/scala/org/apache/spark/ml/feature/RFormula.scala index a7fa50444209b..dcd6fe3c406a4 100644 --- a/mllib/src/main/scala/org/apache/spark/ml/feature/RFormula.scala +++ b/mllib/src/main/scala/org/apache/spark/ml/feature/RFormula.scala @@ -129,7 +129,7 @@ class RFormula(override val uid: String) extends Estimator[RFormulaModel] with R override def copy(extra: ParamMap): RFormula = defaultCopy(extra) - override def toString: String = s"RFormula(${get(formula)})" + override def toString: String = s"RFormula(${get(formula)}) (uid=$uid)" } /** @@ -171,7 +171,7 @@ class RFormulaModel private[feature]( override def copy(extra: ParamMap): RFormulaModel = copyValues( new RFormulaModel(uid, resolvedFormula, pipelineModel)) - override def toString: String = s"RFormulaModel(${resolvedFormula})" + override def toString: String = s"RFormulaModel(${resolvedFormula}) (uid=$uid)" private def transformLabel(dataset: DataFrame): DataFrame = { val labelName = resolvedFormula.label diff --git a/mllib/src/main/scala/org/apache/spark/ml/regression/DecisionTreeRegressor.scala b/mllib/src/main/scala/org/apache/spark/ml/regression/DecisionTreeRegressor.scala index a2bcd67401d08..d9a244bea28d2 100644 --- a/mllib/src/main/scala/org/apache/spark/ml/regression/DecisionTreeRegressor.scala +++ b/mllib/src/main/scala/org/apache/spark/ml/regression/DecisionTreeRegressor.scala @@ -118,7 +118,7 @@ final class DecisionTreeRegressionModel private[ml] ( } override def toString: String = { - s"DecisionTreeRegressionModel of depth $depth with $numNodes nodes" + s"DecisionTreeRegressionModel (uid=$uid) of depth $depth with $numNodes nodes" } /** Convert to a model in the old API */ diff --git a/mllib/src/main/scala/org/apache/spark/ml/regression/GBTRegressor.scala b/mllib/src/main/scala/org/apache/spark/ml/regression/GBTRegressor.scala index b66e61f37dd5e..d841ecb9e58d6 100644 --- a/mllib/src/main/scala/org/apache/spark/ml/regression/GBTRegressor.scala +++ b/mllib/src/main/scala/org/apache/spark/ml/regression/GBTRegressor.scala @@ -189,7 +189,7 @@ final class GBTRegressionModel( } override def toString: String = { - s"GBTRegressionModel with $numTrees trees" + s"GBTRegressionModel (uid=$uid) with $numTrees trees" } /** (private[ml]) Convert 
to a model in the old API */ diff --git a/mllib/src/main/scala/org/apache/spark/ml/regression/RandomForestRegressor.scala b/mllib/src/main/scala/org/apache/spark/ml/regression/RandomForestRegressor.scala index 2f36da371f577..ddb7214416a69 100644 --- a/mllib/src/main/scala/org/apache/spark/ml/regression/RandomForestRegressor.scala +++ b/mllib/src/main/scala/org/apache/spark/ml/regression/RandomForestRegressor.scala @@ -155,7 +155,7 @@ final class RandomForestRegressionModel private[ml] ( } override def toString: String = { - s"RandomForestRegressionModel with $numTrees trees" + s"RandomForestRegressionModel (uid=$uid) with $numTrees trees" } /** From 32407bfd2bdbf84d65cacfa7554dae6a2332bc37 Mon Sep 17 00:00:00 2001 From: Wenchen Fan Date: Mon, 14 Sep 2015 11:51:39 -0700 Subject: [PATCH 272/802] [SPARK-9899] [SQL] log warning for direct output committer with speculation enabled This is a follow-up of https://github.com/apache/spark/pull/8317. When speculation is enabled, there may be multiply tasks writing to the same path. Generally it's OK as we will write to a temporary directory first and only one task can commit the temporary directory to target path. However, when we use direct output committer, tasks will write data to target path directly without temporary directory. This causes problems like corrupted data. Please see [PR comment](https://github.com/apache/spark/pull/8191#issuecomment-131598385) for more details. Unfortunately, we don't have a simple flag to tell if a output committer will write to temporary directory or not, so for safety, we have to disable any customized output committer when `speculation` is true. Author: Wenchen Fan Closes #8687 from cloud-fan/direct-committer. --- .../apache/spark/rdd/PairRDDFunctions.scala | 44 ++++++++++++++++--- .../hive/execution/InsertIntoHiveTable.scala | 17 ++++++- .../spark/sql/hive/hiveWriterContainers.scala | 1 - 3 files changed, 53 insertions(+), 9 deletions(-) diff --git a/core/src/main/scala/org/apache/spark/rdd/PairRDDFunctions.scala b/core/src/main/scala/org/apache/spark/rdd/PairRDDFunctions.scala index 199d79b811d65..a981b63942e6d 100644 --- a/core/src/main/scala/org/apache/spark/rdd/PairRDDFunctions.scala +++ b/core/src/main/scala/org/apache/spark/rdd/PairRDDFunctions.scala @@ -1018,6 +1018,11 @@ class PairRDDFunctions[K, V](self: RDD[(K, V)]) /** * Output the RDD to any Hadoop-supported file system, using a Hadoop `OutputFormat` class * supporting the key and value types K and V in this RDD. + * + * Note that, we should make sure our tasks are idempotent when speculation is enabled, i.e. do + * not use output committer that writes data directly. + * There is an example in https://issues.apache.org/jira/browse/SPARK-10063 to show the bad + * result of using direct output committer with speculation enabled. */ def saveAsHadoopFile( path: String, @@ -1030,10 +1035,7 @@ class PairRDDFunctions[K, V](self: RDD[(K, V)]) val hadoopConf = conf hadoopConf.setOutputKeyClass(keyClass) hadoopConf.setOutputValueClass(valueClass) - // Doesn't work in Scala 2.9 due to what may be a generics bug - // TODO: Should we uncomment this for Scala 2.10? 
- // conf.setOutputFormat(outputFormatClass) - hadoopConf.set("mapred.output.format.class", outputFormatClass.getName) + conf.setOutputFormat(outputFormatClass) for (c <- codec) { hadoopConf.setCompressMapOutput(true) hadoopConf.set("mapred.output.compress", "true") @@ -1047,6 +1049,19 @@ class PairRDDFunctions[K, V](self: RDD[(K, V)]) hadoopConf.setOutputCommitter(classOf[FileOutputCommitter]) } + // When speculation is on and output committer class name contains "Direct", we should warn + // users that they may loss data if they are using a direct output committer. + val speculationEnabled = self.conf.getBoolean("spark.speculation", false) + val outputCommitterClass = hadoopConf.get("mapred.output.committer.class", "") + if (speculationEnabled && outputCommitterClass.contains("Direct")) { + val warningMessage = + s"$outputCommitterClass may be an output committer that writes data directly to " + + "the final location. Because speculation is enabled, this output committer may " + + "cause data loss (see the case in SPARK-10063). If possible, please use a output " + + "committer that does not have this behavior (e.g. FileOutputCommitter)." + logWarning(warningMessage) + } + FileOutputFormat.setOutputPath(hadoopConf, SparkHadoopWriter.createPathFromString(path, hadoopConf)) saveAsHadoopDataset(hadoopConf) @@ -1057,6 +1072,11 @@ class PairRDDFunctions[K, V](self: RDD[(K, V)]) * Configuration object for that storage system. The Conf should set an OutputFormat and any * output paths required (e.g. a table name to write to) in the same way as it would be * configured for a Hadoop MapReduce job. + * + * Note that, we should make sure our tasks are idempotent when speculation is enabled, i.e. do + * not use output committer that writes data directly. + * There is an example in https://issues.apache.org/jira/browse/SPARK-10063 to show the bad + * result of using direct output committer with speculation enabled. */ def saveAsNewAPIHadoopDataset(conf: Configuration): Unit = self.withScope { // Rename this as hadoopConf internally to avoid shadowing (see SPARK-2038). @@ -1115,6 +1135,20 @@ class PairRDDFunctions[K, V](self: RDD[(K, V)]) val jobAttemptId = newTaskAttemptID(jobtrackerID, stageId, isMap = true, 0, 0) val jobTaskContext = newTaskAttemptContext(wrappedConf.value, jobAttemptId) val jobCommitter = jobFormat.getOutputCommitter(jobTaskContext) + + // When speculation is on and output committer class name contains "Direct", we should warn + // users that they may loss data if they are using a direct output committer. + val speculationEnabled = self.conf.getBoolean("spark.speculation", false) + val outputCommitterClass = jobCommitter.getClass.getSimpleName + if (speculationEnabled && outputCommitterClass.contains("Direct")) { + val warningMessage = + s"$outputCommitterClass may be an output committer that writes data directly to " + + "the final location. Because speculation is enabled, this output committer may " + + "cause data loss (see the case in SPARK-10063). If possible, please use a output " + + "committer that does not have this behavior (e.g. FileOutputCommitter)." + logWarning(warningMessage) + } + jobCommitter.setupJob(jobTaskContext) self.context.runJob(self, writeShard) jobCommitter.commitJob(jobTaskContext) @@ -1129,7 +1163,6 @@ class PairRDDFunctions[K, V](self: RDD[(K, V)]) def saveAsHadoopDataset(conf: JobConf): Unit = self.withScope { // Rename this as hadoopConf internally to avoid shadowing (see SPARK-2038). 
val hadoopConf = conf - val wrappedConf = new SerializableConfiguration(hadoopConf) val outputFormatInstance = hadoopConf.getOutputFormat val keyClass = hadoopConf.getOutputKeyClass val valueClass = hadoopConf.getOutputValueClass @@ -1157,7 +1190,6 @@ class PairRDDFunctions[K, V](self: RDD[(K, V)]) writer.preSetup() val writeToFile = (context: TaskContext, iter: Iterator[(K, V)]) => { - val config = wrappedConf.value // Hadoop wants a 32-bit task attempt ID, so if ours is bigger than Int.MaxValue, roll it // around by taking a mod. We expect that no task will be attempted 2 billion times. val taskAttemptId = (context.taskAttemptId % Int.MaxValue).toInt diff --git a/sql/hive/src/main/scala/org/apache/spark/sql/hive/execution/InsertIntoHiveTable.scala b/sql/hive/src/main/scala/org/apache/spark/sql/hive/execution/InsertIntoHiveTable.scala index 58f7fa640e8a9..0c700bdb370ac 100644 --- a/sql/hive/src/main/scala/org/apache/spark/sql/hive/execution/InsertIntoHiveTable.scala +++ b/sql/hive/src/main/scala/org/apache/spark/sql/hive/execution/InsertIntoHiveTable.scala @@ -28,7 +28,7 @@ import org.apache.hadoop.hive.ql.{Context, ErrorMsg} import org.apache.hadoop.hive.serde2.Serializer import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspectorUtils.ObjectInspectorCopyOption import org.apache.hadoop.hive.serde2.objectinspector._ -import org.apache.hadoop.mapred.{FileOutputFormat, JobConf} +import org.apache.hadoop.mapred.{FileOutputCommitter, FileOutputFormat, JobConf} import org.apache.spark.rdd.RDD import org.apache.spark.sql.Row @@ -62,7 +62,7 @@ case class InsertIntoHiveTable( def output: Seq[Attribute] = Seq.empty - def saveAsHiveFile( + private def saveAsHiveFile( rdd: RDD[InternalRow], valueClass: Class[_], fileSinkConf: FileSinkDesc, @@ -178,6 +178,19 @@ case class InsertIntoHiveTable( val jobConf = new JobConf(sc.hiveconf) val jobConfSer = new SerializableJobConf(jobConf) + // When speculation is on and output committer class name contains "Direct", we should warn + // users that they may loss data if they are using a direct output committer. + val speculationEnabled = sqlContext.sparkContext.conf.getBoolean("spark.speculation", false) + val outputCommitterClass = jobConf.get("mapred.output.committer.class", "") + if (speculationEnabled && outputCommitterClass.contains("Direct")) { + val warningMessage = + s"$outputCommitterClass may be an output committer that writes data directly to " + + "the final location. Because speculation is enabled, this output committer may " + + "cause data loss (see the case in SPARK-10063). If possible, please use a output " + + "committer that does not have this behavior (e.g. FileOutputCommitter)." 
+ logWarning(warningMessage) + } + val writerContainer = if (numDynamicPartitions > 0) { val dynamicPartColNames = partitionColumnNames.takeRight(numDynamicPartitions) new SparkHiveDynamicPartitionWriterContainer(jobConf, fileSinkConf, dynamicPartColNames) diff --git a/sql/hive/src/main/scala/org/apache/spark/sql/hive/hiveWriterContainers.scala b/sql/hive/src/main/scala/org/apache/spark/sql/hive/hiveWriterContainers.scala index 29a6f08f40728..4ca8042d22367 100644 --- a/sql/hive/src/main/scala/org/apache/spark/sql/hive/hiveWriterContainers.scala +++ b/sql/hive/src/main/scala/org/apache/spark/sql/hive/hiveWriterContainers.scala @@ -32,7 +32,6 @@ import org.apache.hadoop.mapred._ import org.apache.hadoop.hive.common.FileUtils import org.apache.spark.mapred.SparkHadoopMapRedUtil -import org.apache.spark.sql.Row import org.apache.spark.{Logging, SerializableWritable, SparkHadoopWriter} import org.apache.spark.sql.catalyst.InternalRow import org.apache.spark.sql.catalyst.util.DateTimeUtils From cf2821ef5fd9965eb6256e8e8b3f1e00c0788098 Mon Sep 17 00:00:00 2001 From: Kousuke Saruta Date: Mon, 14 Sep 2015 12:06:23 -0700 Subject: [PATCH 273/802] [SPARK-10584] [DOC] [SQL] Documentation about spark.sql.hive.metastore.version is wrong. The default value of hive metastore version is 1.2.1 but the documentation says the value of `spark.sql.hive.metastore.version` is 0.13.1. Also, we cannot get the default value by `sqlContext.getConf("spark.sql.hive.metastore.version")`. Author: Kousuke Saruta Closes #8739 from sarutak/SPARK-10584. --- docs/sql-programming-guide.md | 2 +- .../scala/org/apache/spark/sql/hive/HiveContext.scala | 11 +++++++---- 2 files changed, 8 insertions(+), 5 deletions(-) diff --git a/docs/sql-programming-guide.md b/docs/sql-programming-guide.md index 6a1b0fbfa1eb3..a0b911d207243 100644 --- a/docs/sql-programming-guide.md +++ b/docs/sql-programming-guide.md @@ -1687,7 +1687,7 @@ The following options can be used to configure the version of Hive that is used Property NameDefaultMeaning spark.sql.hive.metastore.version - 0.13.1 + 1.2.1 Version of the Hive metastore. Available options are 0.12.0 through 1.2.1. diff --git a/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveContext.scala b/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveContext.scala index 2e791cea96b41..d37ba5ddc2d80 100644 --- a/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveContext.scala +++ b/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveContext.scala @@ -111,8 +111,7 @@ class HiveContext(sc: SparkContext) extends SQLContext(sc) with Logging { * this does not necessarily need to be the same version of Hive that is used internally by * Spark SQL for execution. */ - protected[hive] def hiveMetastoreVersion: String = - getConf(HIVE_METASTORE_VERSION, hiveExecutionVersion) + protected[hive] def hiveMetastoreVersion: String = getConf(HIVE_METASTORE_VERSION) /** * The location of the jars that should be used to instantiate the HiveMetastoreClient. This @@ -202,7 +201,7 @@ class HiveContext(sc: SparkContext) extends SQLContext(sc) with Logging { "Builtin jars can only be used when hive execution version == hive metastore version. " + s"Execution: ${hiveExecutionVersion} != Metastore: ${hiveMetastoreVersion}. 
" + "Specify a vaild path to the correct hive jars using $HIVE_METASTORE_JARS " + - s"or change $HIVE_METASTORE_VERSION to $hiveExecutionVersion.") + s"or change ${HIVE_METASTORE_VERSION.key} to $hiveExecutionVersion.") } // We recursively find all jars in the class loader chain, @@ -606,7 +605,11 @@ private[hive] object HiveContext { /** The version of hive used internally by Spark SQL. */ val hiveExecutionVersion: String = "1.2.1" - val HIVE_METASTORE_VERSION: String = "spark.sql.hive.metastore.version" + val HIVE_METASTORE_VERSION = stringConf("spark.sql.hive.metastore.version", + defaultValue = Some(hiveExecutionVersion), + doc = "Version of the Hive metastore. Available options are " + + s"0.12.0 through $hiveExecutionVersion.") + val HIVE_METASTORE_JARS = stringConf("spark.sql.hive.metastore.jars", defaultValue = Some("builtin"), doc = s""" From ce6f3f163bc667cb5da9ab4331c8bad10cc0d701 Mon Sep 17 00:00:00 2001 From: Yanbo Liang Date: Mon, 14 Sep 2015 12:08:52 -0700 Subject: [PATCH 274/802] [SPARK-10194] [MLLIB] [PYSPARK] SGD algorithms need convergenceTol parameter in Python [SPARK-3382](https://issues.apache.org/jira/browse/SPARK-3382) added a ```convergenceTol``` parameter for GradientDescent-based methods in Scala. We need that parameter in Python; otherwise, Python users will not be able to adjust that behavior (or even reproduce behavior from previous releases since the default changed). Author: Yanbo Liang Closes #8457 from yanboliang/spark-10194. --- .../mllib/api/python/PythonMLLibAPI.scala | 20 +++++++++--- python/pyspark/mllib/classification.py | 17 +++++++--- python/pyspark/mllib/regression.py | 32 ++++++++++++------- 3 files changed, 48 insertions(+), 21 deletions(-) diff --git a/mllib/src/main/scala/org/apache/spark/mllib/api/python/PythonMLLibAPI.scala b/mllib/src/main/scala/org/apache/spark/mllib/api/python/PythonMLLibAPI.scala index f585aacd452e0..69ce7f50709a1 100644 --- a/mllib/src/main/scala/org/apache/spark/mllib/api/python/PythonMLLibAPI.scala +++ b/mllib/src/main/scala/org/apache/spark/mllib/api/python/PythonMLLibAPI.scala @@ -132,7 +132,8 @@ private[python] class PythonMLLibAPI extends Serializable { regParam: Double, regType: String, intercept: Boolean, - validateData: Boolean): JList[Object] = { + validateData: Boolean, + convergenceTol: Double): JList[Object] = { val lrAlg = new LinearRegressionWithSGD() lrAlg.setIntercept(intercept) .setValidateData(validateData) @@ -141,6 +142,7 @@ private[python] class PythonMLLibAPI extends Serializable { .setRegParam(regParam) .setStepSize(stepSize) .setMiniBatchFraction(miniBatchFraction) + .setConvergenceTol(convergenceTol) lrAlg.optimizer.setUpdater(getUpdaterFromString(regType)) trainRegressionModel( lrAlg, @@ -159,7 +161,8 @@ private[python] class PythonMLLibAPI extends Serializable { miniBatchFraction: Double, initialWeights: Vector, intercept: Boolean, - validateData: Boolean): JList[Object] = { + validateData: Boolean, + convergenceTol: Double): JList[Object] = { val lassoAlg = new LassoWithSGD() lassoAlg.setIntercept(intercept) .setValidateData(validateData) @@ -168,6 +171,7 @@ private[python] class PythonMLLibAPI extends Serializable { .setRegParam(regParam) .setStepSize(stepSize) .setMiniBatchFraction(miniBatchFraction) + .setConvergenceTol(convergenceTol) trainRegressionModel( lassoAlg, data, @@ -185,7 +189,8 @@ private[python] class PythonMLLibAPI extends Serializable { miniBatchFraction: Double, initialWeights: Vector, intercept: Boolean, - validateData: Boolean): JList[Object] = { + validateData: Boolean, 
+ convergenceTol: Double): JList[Object] = { val ridgeAlg = new RidgeRegressionWithSGD() ridgeAlg.setIntercept(intercept) .setValidateData(validateData) @@ -194,6 +199,7 @@ private[python] class PythonMLLibAPI extends Serializable { .setRegParam(regParam) .setStepSize(stepSize) .setMiniBatchFraction(miniBatchFraction) + .setConvergenceTol(convergenceTol) trainRegressionModel( ridgeAlg, data, @@ -212,7 +218,8 @@ private[python] class PythonMLLibAPI extends Serializable { initialWeights: Vector, regType: String, intercept: Boolean, - validateData: Boolean): JList[Object] = { + validateData: Boolean, + convergenceTol: Double): JList[Object] = { val SVMAlg = new SVMWithSGD() SVMAlg.setIntercept(intercept) .setValidateData(validateData) @@ -221,6 +228,7 @@ private[python] class PythonMLLibAPI extends Serializable { .setRegParam(regParam) .setStepSize(stepSize) .setMiniBatchFraction(miniBatchFraction) + .setConvergenceTol(convergenceTol) SVMAlg.optimizer.setUpdater(getUpdaterFromString(regType)) trainRegressionModel( SVMAlg, @@ -240,7 +248,8 @@ private[python] class PythonMLLibAPI extends Serializable { regParam: Double, regType: String, intercept: Boolean, - validateData: Boolean): JList[Object] = { + validateData: Boolean, + convergenceTol: Double): JList[Object] = { val LogRegAlg = new LogisticRegressionWithSGD() LogRegAlg.setIntercept(intercept) .setValidateData(validateData) @@ -249,6 +258,7 @@ private[python] class PythonMLLibAPI extends Serializable { .setRegParam(regParam) .setStepSize(stepSize) .setMiniBatchFraction(miniBatchFraction) + .setConvergenceTol(convergenceTol) LogRegAlg.optimizer.setUpdater(getUpdaterFromString(regType)) trainRegressionModel( LogRegAlg, diff --git a/python/pyspark/mllib/classification.py b/python/pyspark/mllib/classification.py index 8f27c446a66e8..cb4ee83678081 100644 --- a/python/pyspark/mllib/classification.py +++ b/python/pyspark/mllib/classification.py @@ -241,7 +241,7 @@ class LogisticRegressionWithSGD(object): @classmethod def train(cls, data, iterations=100, step=1.0, miniBatchFraction=1.0, initialWeights=None, regParam=0.01, regType="l2", intercept=False, - validateData=True): + validateData=True, convergenceTol=0.001): """ Train a logistic regression model on the given data. @@ -274,11 +274,13 @@ def train(cls, data, iterations=100, step=1.0, miniBatchFraction=1.0, :param validateData: Boolean parameter which indicates if the algorithm should validate data before training. (default: True) + :param convergenceTol: A condition which decides iteration termination. + (default: 0.001) """ def train(rdd, i): return callMLlibFunc("trainLogisticRegressionModelWithSGD", rdd, int(iterations), float(step), float(miniBatchFraction), i, float(regParam), regType, - bool(intercept), bool(validateData)) + bool(intercept), bool(validateData), float(convergenceTol)) return _regression_train_wrapper(train, LogisticRegressionModel, data, initialWeights) @@ -439,7 +441,7 @@ class SVMWithSGD(object): @classmethod def train(cls, data, iterations=100, step=1.0, regParam=0.01, miniBatchFraction=1.0, initialWeights=None, regType="l2", - intercept=False, validateData=True): + intercept=False, validateData=True, convergenceTol=0.001): """ Train a support vector machine on the given data. @@ -472,11 +474,13 @@ def train(cls, data, iterations=100, step=1.0, regParam=0.01, :param validateData: Boolean parameter which indicates if the algorithm should validate data before training. (default: True) + :param convergenceTol: A condition which decides iteration termination. 
+ (default: 0.001) """ def train(rdd, i): return callMLlibFunc("trainSVMModelWithSGD", rdd, int(iterations), float(step), float(regParam), float(miniBatchFraction), i, regType, - bool(intercept), bool(validateData)) + bool(intercept), bool(validateData), float(convergenceTol)) return _regression_train_wrapper(train, SVMModel, data, initialWeights) @@ -600,12 +604,15 @@ class StreamingLogisticRegressionWithSGD(StreamingLinearAlgorithm): :param miniBatchFraction: Fraction of data on which SGD is run for each iteration. :param regParam: L2 Regularization parameter. + :param convergenceTol: A condition which decides iteration termination. """ - def __init__(self, stepSize=0.1, numIterations=50, miniBatchFraction=1.0, regParam=0.01): + def __init__(self, stepSize=0.1, numIterations=50, miniBatchFraction=1.0, regParam=0.01, + convergenceTol=0.001): self.stepSize = stepSize self.numIterations = numIterations self.regParam = regParam self.miniBatchFraction = miniBatchFraction + self.convergenceTol = convergenceTol self._model = None super(StreamingLogisticRegressionWithSGD, self).__init__( model=self._model) diff --git a/python/pyspark/mllib/regression.py b/python/pyspark/mllib/regression.py index 41946e3674fbe..256b7537fef6b 100644 --- a/python/pyspark/mllib/regression.py +++ b/python/pyspark/mllib/regression.py @@ -28,7 +28,8 @@ 'LinearRegressionModel', 'LinearRegressionWithSGD', 'RidgeRegressionModel', 'RidgeRegressionWithSGD', 'LassoModel', 'LassoWithSGD', 'IsotonicRegressionModel', - 'IsotonicRegression'] + 'IsotonicRegression', 'StreamingLinearAlgorithm', + 'StreamingLinearRegressionWithSGD'] class LabeledPoint(object): @@ -202,7 +203,7 @@ class LinearRegressionWithSGD(object): @classmethod def train(cls, data, iterations=100, step=1.0, miniBatchFraction=1.0, initialWeights=None, regParam=0.0, regType=None, intercept=False, - validateData=True): + validateData=True, convergenceTol=0.001): """ Train a linear regression model using Stochastic Gradient Descent (SGD). @@ -244,11 +245,14 @@ def train(cls, data, iterations=100, step=1.0, miniBatchFraction=1.0, :param validateData: Boolean parameter which indicates if the algorithm should validate data before training. (default: True) + :param convergenceTol: A condition which decides iteration termination. + (default: 0.001) """ def train(rdd, i): return callMLlibFunc("trainLinearRegressionModelWithSGD", rdd, int(iterations), float(step), float(miniBatchFraction), i, float(regParam), - regType, bool(intercept), bool(validateData)) + regType, bool(intercept), bool(validateData), + float(convergenceTol)) return _regression_train_wrapper(train, LinearRegressionModel, data, initialWeights) @@ -330,7 +334,7 @@ class LassoWithSGD(object): @classmethod def train(cls, data, iterations=100, step=1.0, regParam=0.01, miniBatchFraction=1.0, initialWeights=None, intercept=False, - validateData=True): + validateData=True, convergenceTol=0.001): """ Train a regression model with L1-regularization using Stochastic Gradient Descent. @@ -362,11 +366,13 @@ def train(cls, data, iterations=100, step=1.0, regParam=0.01, :param validateData: Boolean parameter which indicates if the algorithm should validate data before training. (default: True) + :param convergenceTol: A condition which decides iteration termination. 
+ (default: 0.001) """ def train(rdd, i): return callMLlibFunc("trainLassoModelWithSGD", rdd, int(iterations), float(step), float(regParam), float(miniBatchFraction), i, bool(intercept), - bool(validateData)) + bool(validateData), float(convergenceTol)) return _regression_train_wrapper(train, LassoModel, data, initialWeights) @@ -449,7 +455,7 @@ class RidgeRegressionWithSGD(object): @classmethod def train(cls, data, iterations=100, step=1.0, regParam=0.01, miniBatchFraction=1.0, initialWeights=None, intercept=False, - validateData=True): + validateData=True, convergenceTol=0.001): """ Train a regression model with L2-regularization using Stochastic Gradient Descent. @@ -481,11 +487,13 @@ def train(cls, data, iterations=100, step=1.0, regParam=0.01, :param validateData: Boolean parameter which indicates if the algorithm should validate data before training. (default: True) + :param convergenceTol: A condition which decides iteration termination. + (default: 0.001) """ def train(rdd, i): return callMLlibFunc("trainRidgeModelWithSGD", rdd, int(iterations), float(step), float(regParam), float(miniBatchFraction), i, bool(intercept), - bool(validateData)) + bool(validateData), float(convergenceTol)) return _regression_train_wrapper(train, RidgeRegressionModel, data, initialWeights) @@ -636,15 +644,17 @@ class StreamingLinearRegressionWithSGD(StreamingLinearAlgorithm): After training on a batch of data, the weights obtained at the end of training are used as initial weights for the next batch. - :param: stepSize Step size for each iteration of gradient descent. - :param: numIterations Total number of iterations run. - :param: miniBatchFraction Fraction of data on which SGD is run for each + :param stepSize: Step size for each iteration of gradient descent. + :param numIterations: Total number of iterations run. + :param miniBatchFraction: Fraction of data on which SGD is run for each iteration. + :param convergenceTol: A condition which decides iteration termination. """ - def __init__(self, stepSize=0.1, numIterations=50, miniBatchFraction=1.0): + def __init__(self, stepSize=0.1, numIterations=50, miniBatchFraction=1.0, convergenceTol=0.001): self.stepSize = stepSize self.numIterations = numIterations self.miniBatchFraction = miniBatchFraction + self.convergenceTol = convergenceTol self._model = None super(StreamingLinearRegressionWithSGD, self).__init__( model=self._model) From 8a634e9bcc671167613fb575c6c0c054fb4b3479 Mon Sep 17 00:00:00 2001 From: Nick Pritchard Date: Mon, 14 Sep 2015 13:27:45 -0700 Subject: [PATCH 275/802] [SPARK-10573] [ML] IndexToString output schema should be StringType Fixes bug where IndexToString output schema was DoubleType. Correct me if I'm wrong, but it doesn't seem like the output needs to have any "ML Attribute" metadata. Author: Nick Pritchard Closes #8751 from pnpritchard/SPARK-10573. 
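For reference, a minimal sketch of the behavior this patch fixes (it mirrors the regression test added below, with hypothetical column names): the declared output column of IndexToString should be StringType, not DoubleType:

    import org.apache.spark.ml.feature.IndexToString
    import org.apache.spark.sql.types.{DoubleType, StringType, StructField, StructType}

    // Sketch only: transformSchema should now report StringType for the output column.
    val idxToStr = new IndexToString().setInputCol("categoryIndex").setOutputCol("category")
    val inSchema = StructType(Seq(StructField("categoryIndex", DoubleType)))
    val outSchema = idxToStr.transformSchema(inSchema)
    assert(outSchema("category").dataType == StringType)  // reported as DoubleType before this fix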
--- .../scala/org/apache/spark/ml/feature/StringIndexer.scala | 5 ++--- .../org/apache/spark/ml/feature/StringIndexerSuite.scala | 8 ++++++++ 2 files changed, 10 insertions(+), 3 deletions(-) diff --git a/mllib/src/main/scala/org/apache/spark/ml/feature/StringIndexer.scala b/mllib/src/main/scala/org/apache/spark/ml/feature/StringIndexer.scala index 3a4ab9a857648..2b1592930e77b 100644 --- a/mllib/src/main/scala/org/apache/spark/ml/feature/StringIndexer.scala +++ b/mllib/src/main/scala/org/apache/spark/ml/feature/StringIndexer.scala @@ -27,7 +27,7 @@ import org.apache.spark.ml.Transformer import org.apache.spark.ml.util.Identifiable import org.apache.spark.sql.DataFrame import org.apache.spark.sql.functions._ -import org.apache.spark.sql.types.{DoubleType, NumericType, StringType, StructType} +import org.apache.spark.sql.types._ import org.apache.spark.util.collection.OpenHashMap /** @@ -229,8 +229,7 @@ class IndexToString private[ml] ( val outputColName = $(outputCol) require(inputFields.forall(_.name != outputColName), s"Output column $outputColName already exists.") - val attr = NominalAttribute.defaultAttr.withName($(outputCol)) - val outputFields = inputFields :+ attr.toStructField() + val outputFields = inputFields :+ StructField($(outputCol), StringType) StructType(outputFields) } diff --git a/mllib/src/test/scala/org/apache/spark/ml/feature/StringIndexerSuite.scala b/mllib/src/test/scala/org/apache/spark/ml/feature/StringIndexerSuite.scala index 05e05bdc64bb1..ddcdb5f4212be 100644 --- a/mllib/src/test/scala/org/apache/spark/ml/feature/StringIndexerSuite.scala +++ b/mllib/src/test/scala/org/apache/spark/ml/feature/StringIndexerSuite.scala @@ -17,6 +17,7 @@ package org.apache.spark.ml.feature +import org.apache.spark.sql.types.{StringType, StructType, StructField, DoubleType} import org.apache.spark.{SparkException, SparkFunSuite} import org.apache.spark.ml.attribute.{Attribute, NominalAttribute} import org.apache.spark.ml.param.ParamsSuite @@ -165,4 +166,11 @@ class StringIndexerSuite extends SparkFunSuite with MLlibTestSparkContext { assert(a === b) } } + + test("IndexToString.transformSchema (SPARK-10573)") { + val idxToStr = new IndexToString().setInputCol("input").setOutputCol("output") + val inSchema = StructType(Seq(StructField("input", DoubleType))) + val outSchema = idxToStr.transformSchema(inSchema) + assert(outSchema("output").dataType === StringType) + } } From 7e32387ae6303fd1cd32389d47df87170b841c67 Mon Sep 17 00:00:00 2001 From: Davies Liu Date: Mon, 14 Sep 2015 14:10:54 -0700 Subject: [PATCH 276/802] [SPARK-10522] [SQL] Nanoseconds of Timestamp in Parquet should be positive Or Hive can't read it back correctly. Thanks vanzin for report this. Author: Davies Liu Closes #8674 from davies/positive_nano. 
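To make the motivation concrete, here is a small worked sketch of the new arithmetic (not an excerpt from the patch; the Julian-day-of-epoch constant is assumed to be 2440588, matching DateTimeUtils). Flooring against the Julian epoch keeps the intra-day part non-negative even for timestamps before 1970:

    // Assumed constants; mirrors the rewritten toJulianDay for illustration only.
    val JULIAN_DAY_OF_EPOCH = 2440588L                    // assumed Julian day of 1970-01-01
    val MICROS_PER_DAY = 24L * 60 * 60 * 1000 * 1000

    def toJulianDaySketch(us: Long): (Long, Long) = {
      val julianUs = us + JULIAN_DAY_OF_EPOCH * MICROS_PER_DAY
      (julianUs / MICROS_PER_DAY, (julianUs % MICROS_PER_DAY) * 1000L)
    }

    // One microsecond before the epoch:
    //   old logic: (2440588, -1000)            -- negative nanoseconds, which Hive cannot read back
    //   new logic: (2440587, 86399999999000)   -- nanoseconds stay within [0, one full day)
    println(toJulianDaySketch(-1L))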
--- .../spark/sql/catalyst/util/DateTimeUtils.scala | 12 +++++++----- .../sql/catalyst/util/DateTimeUtilsSuite.scala | 17 ++++++++--------- 2 files changed, 15 insertions(+), 14 deletions(-) diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/util/DateTimeUtils.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/util/DateTimeUtils.scala index d652fce3fd9b6..687ca000d12bb 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/util/DateTimeUtils.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/util/DateTimeUtils.scala @@ -42,6 +42,7 @@ object DateTimeUtils { final val SECONDS_PER_DAY = 60 * 60 * 24L final val MICROS_PER_SECOND = 1000L * 1000L final val NANOS_PER_SECOND = MICROS_PER_SECOND * 1000L + final val MICROS_PER_DAY = MICROS_PER_SECOND * SECONDS_PER_DAY final val MILLIS_PER_DAY = SECONDS_PER_DAY * 1000L @@ -190,13 +191,14 @@ object DateTimeUtils { /** * Returns Julian day and nanoseconds in a day from the number of microseconds + * + * Note: support timestamp since 4717 BC (without negative nanoseconds, compatible with Hive). */ def toJulianDay(us: SQLTimestamp): (Int, Long) = { - val seconds = us / MICROS_PER_SECOND - val day = seconds / SECONDS_PER_DAY + JULIAN_DAY_OF_EPOCH - val secondsInDay = seconds % SECONDS_PER_DAY - val nanos = (us % MICROS_PER_SECOND) * 1000L - (day.toInt, secondsInDay * NANOS_PER_SECOND + nanos) + val julian_us = us + JULIAN_DAY_OF_EPOCH * MICROS_PER_DAY + val day = julian_us / MICROS_PER_DAY + val micros = julian_us % MICROS_PER_DAY + (day.toInt, micros * 1000L) } /** diff --git a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/util/DateTimeUtilsSuite.scala b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/util/DateTimeUtilsSuite.scala index 1596bb79fa94b..6b9a11f0ff743 100644 --- a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/util/DateTimeUtilsSuite.scala +++ b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/util/DateTimeUtilsSuite.scala @@ -52,15 +52,14 @@ class DateTimeUtilsSuite extends SparkFunSuite { assert(ns === 0) assert(fromJulianDay(d, ns) == 0L) - val t = Timestamp.valueOf("2015-06-11 10:10:10.100") - val (d1, ns1) = toJulianDay(fromJavaTimestamp(t)) - val t1 = toJavaTimestamp(fromJulianDay(d1, ns1)) - assert(t.equals(t1)) - - val t2 = Timestamp.valueOf("2015-06-11 20:10:10.100") - val (d2, ns2) = toJulianDay(fromJavaTimestamp(t2)) - val t22 = toJavaTimestamp(fromJulianDay(d2, ns2)) - assert(t2.equals(t22)) + Seq(Timestamp.valueOf("2015-06-11 10:10:10.100"), + Timestamp.valueOf("2015-06-11 20:10:10.100"), + Timestamp.valueOf("1900-06-11 20:10:10.100")).foreach { t => + val (d, ns) = toJulianDay(fromJavaTimestamp(t)) + assert(ns > 0) + val t1 = toJavaTimestamp(fromJulianDay(d, ns)) + assert(t.equals(t1)) + } } test("SPARK-6785: java date conversion before and after epoch") { From 64f04154e3078ec7340da97e3c2b07cf24e89098 Mon Sep 17 00:00:00 2001 From: Edoardo Vacchi Date: Mon, 14 Sep 2015 14:56:04 -0700 Subject: [PATCH 277/802] [SPARK-6981] [SQL] Factor out SparkPlanner and QueryExecution from SQLContext Alternative to PR #6122; in this case the refactored out classes are replaced by inner classes with the same name for backwards binary compatibility * process in a lighter-weight, backwards-compatible way Author: Edoardo Vacchi Closes #6356 from evacchi/sqlctx-refactoring-lite. 
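The compatibility move described above, reduced to a generic sketch: the logic migrates to a top-level class, while the old nested type survives only as a deprecated shim that extends it, so callers written against the inner-class path keep compiling. The names below are illustrative, not Spark APIs:

```scala
// New home for the behaviour: an ordinary top-level class.
class Engine(val owner: Service) {
  def describe(): String = s"engine owned by ${owner.name}"
}

class Service(val name: String) {
  // The old inner type is kept only as a thin, deprecated alias for compatibility.
  @deprecated("use Engine directly", "1.6.0")
  class EngineShim extends Engine(this)

  def newEngine(): Engine = new Engine(this)
}

// Code written against the old nested type still works unchanged:
val svc = new Service("demo")
val viaShim: Engine = new svc.EngineShim
println(viaShim.describe())
```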
--- .../org/apache/spark/sql/DataFrame.scala | 4 +- .../org/apache/spark/sql/SQLContext.scala | 138 ++---------------- .../spark/sql/execution/QueryExecution.scala | 85 +++++++++++ .../spark/sql/execution/SQLExecution.scala | 2 +- .../spark/sql/execution/SparkPlanner.scala | 92 ++++++++++++ .../spark/sql/execution/SparkStrategies.scala | 2 +- 6 files changed, 195 insertions(+), 128 deletions(-) create mode 100644 sql/core/src/main/scala/org/apache/spark/sql/execution/QueryExecution.scala create mode 100644 sql/core/src/main/scala/org/apache/spark/sql/execution/SparkPlanner.scala diff --git a/sql/core/src/main/scala/org/apache/spark/sql/DataFrame.scala b/sql/core/src/main/scala/org/apache/spark/sql/DataFrame.scala index 1a687b2374f14..3e61123c145cd 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/DataFrame.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/DataFrame.scala @@ -37,7 +37,7 @@ import org.apache.spark.sql.catalyst.expressions._ import org.apache.spark.sql.catalyst.plans.logical._ import org.apache.spark.sql.catalyst.plans.{Inner, JoinType} import org.apache.spark.sql.catalyst.{CatalystTypeConverters, ScalaReflection, SqlParser} -import org.apache.spark.sql.execution.{EvaluatePython, ExplainCommand, FileRelation, LogicalRDD, SQLExecution} +import org.apache.spark.sql.execution.{EvaluatePython, ExplainCommand, FileRelation, LogicalRDD, QueryExecution, SQLExecution} import org.apache.spark.sql.execution.datasources.{CreateTableUsingAsSelect, LogicalRelation} import org.apache.spark.sql.execution.datasources.json.JacksonGenerator import org.apache.spark.sql.sources.HadoopFsRelation @@ -114,7 +114,7 @@ private[sql] object DataFrame { @Experimental class DataFrame private[sql]( @transient val sqlContext: SQLContext, - @DeveloperApi @transient val queryExecution: SQLContext#QueryExecution) extends Serializable { + @DeveloperApi @transient val queryExecution: QueryExecution) extends Serializable { // Note for Spark contributors: if adding or updating any action in `DataFrame`, please make sure // you wrap it with `withNewExecutionId` if this actions doesn't call other action. 
diff --git a/sql/core/src/main/scala/org/apache/spark/sql/SQLContext.scala b/sql/core/src/main/scala/org/apache/spark/sql/SQLContext.scala index 4e8414af50b44..e3fdd782e6ff6 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/SQLContext.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/SQLContext.scala @@ -38,6 +38,10 @@ import org.apache.spark.sql.catalyst.optimizer.{DefaultOptimizer, Optimizer} import org.apache.spark.sql.catalyst.plans.logical.{LocalRelation, LogicalPlan} import org.apache.spark.sql.catalyst.rules.RuleExecutor import org.apache.spark.sql.catalyst.{InternalRow, ParserDialect, _} +import org.apache.spark.sql.execution.{Filter, _} +import org.apache.spark.sql.{execution => sparkexecution} +import org.apache.spark.sql.execution._ +import org.apache.spark.sql.sources._ import org.apache.spark.sql.execution._ import org.apache.spark.sql.execution.datasources._ import org.apache.spark.sql.execution.ui.{SQLListener, SQLTab} @@ -188,9 +192,11 @@ class SQLContext(@transient val sparkContext: SparkContext) protected[sql] def parseSql(sql: String): LogicalPlan = ddlParser.parse(sql, false) - protected[sql] def executeSql(sql: String): this.QueryExecution = executePlan(parseSql(sql)) + protected[sql] def executeSql(sql: String): + org.apache.spark.sql.execution.QueryExecution = executePlan(parseSql(sql)) - protected[sql] def executePlan(plan: LogicalPlan) = new this.QueryExecution(plan) + protected[sql] def executePlan(plan: LogicalPlan) = + new sparkexecution.QueryExecution(this, plan) @transient protected[sql] val tlSession = new ThreadLocal[SQLSession]() { @@ -781,77 +787,11 @@ class SQLContext(@transient val sparkContext: SparkContext) }.toArray } - protected[sql] class SparkPlanner extends SparkStrategies { - val sparkContext: SparkContext = self.sparkContext - - val sqlContext: SQLContext = self - - def codegenEnabled: Boolean = self.conf.codegenEnabled - - def unsafeEnabled: Boolean = self.conf.unsafeEnabled - - def numPartitions: Int = self.conf.numShufflePartitions - - def strategies: Seq[Strategy] = - experimental.extraStrategies ++ ( - DataSourceStrategy :: - DDLStrategy :: - TakeOrderedAndProject :: - HashAggregation :: - Aggregation :: - LeftSemiJoin :: - EquiJoinSelection :: - InMemoryScans :: - BasicOperators :: - CartesianProduct :: - BroadcastNestedLoopJoin :: Nil) - - /** - * Used to build table scan operators where complex projection and filtering are done using - * separate physical operators. This function returns the given scan operator with Project and - * Filter nodes added only when needed. For example, a Project operator is only used when the - * final desired output requires complex expressions to be evaluated or when columns can be - * further eliminated out after filtering has been done. - * - * The `prunePushedDownFilters` parameter is used to remove those filters that can be optimized - * away by the filter pushdown optimization. - * - * The required attributes for both filtering and expression evaluation are passed to the - * provided `scanBuilder` function so that it can avoid unnecessary column materialization. 
- */ - def pruneFilterProject( - projectList: Seq[NamedExpression], - filterPredicates: Seq[Expression], - prunePushedDownFilters: Seq[Expression] => Seq[Expression], - scanBuilder: Seq[Attribute] => SparkPlan): SparkPlan = { - - val projectSet = AttributeSet(projectList.flatMap(_.references)) - val filterSet = AttributeSet(filterPredicates.flatMap(_.references)) - val filterCondition = - prunePushedDownFilters(filterPredicates).reduceLeftOption(catalyst.expressions.And) - - // Right now we still use a projection even if the only evaluation is applying an alias - // to a column. Since this is a no-op, it could be avoided. However, using this - // optimization with the current implementation would change the output schema. - // TODO: Decouple final output schema from expression evaluation so this copy can be - // avoided safely. - - if (AttributeSet(projectList.map(_.toAttribute)) == projectSet && - filterSet.subsetOf(projectSet)) { - // When it is possible to just use column pruning to get the right projection and - // when the columns of this projection are enough to evaluate all filter conditions, - // just do a scan followed by a filter, with no extra project. - val scan = scanBuilder(projectList.asInstanceOf[Seq[Attribute]]) - filterCondition.map(Filter(_, scan)).getOrElse(scan) - } else { - val scan = scanBuilder((projectSet ++ filterSet).toSeq) - Project(projectList, filterCondition.map(Filter(_, scan)).getOrElse(scan)) - } - } - } + @deprecated("use org.apache.spark.sql.SparkPlanner", "1.6.0") + protected[sql] class SparkPlanner extends sparkexecution.SparkPlanner(this) @transient - protected[sql] val planner = new SparkPlanner + protected[sql] val planner: sparkexecution.SparkPlanner = new sparkexecution.SparkPlanner(this) @transient protected[sql] lazy val emptyResult = sparkContext.parallelize(Seq.empty[InternalRow], 1) @@ -898,59 +838,9 @@ class SQLContext(@transient val sparkContext: SparkContext) protected[sql] lazy val conf: SQLConf = new SQLConf } - /** - * :: DeveloperApi :: - * The primary workflow for executing relational queries using Spark. Designed to allow easy - * access to the intermediate phases of query execution for developers. - */ - @DeveloperApi - protected[sql] class QueryExecution(val logical: LogicalPlan) { - def assertAnalyzed(): Unit = analyzer.checkAnalysis(analyzed) - - lazy val analyzed: LogicalPlan = analyzer.execute(logical) - lazy val withCachedData: LogicalPlan = { - assertAnalyzed() - cacheManager.useCachedData(analyzed) - } - lazy val optimizedPlan: LogicalPlan = optimizer.execute(withCachedData) - - // TODO: Don't just pick the first one... - lazy val sparkPlan: SparkPlan = { - SparkPlan.currentContext.set(self) - planner.plan(optimizedPlan).next() - } - // executedPlan should not be used to initialize any SparkPlan. It should be - // only used for execution. - lazy val executedPlan: SparkPlan = prepareForExecution.execute(sparkPlan) - - /** Internal version of the RDD. 
Avoids copies and has no schema */ - lazy val toRdd: RDD[InternalRow] = executedPlan.execute() - - protected def stringOrError[A](f: => A): String = - try f.toString catch { case e: Throwable => e.toString } - - def simpleString: String = - s"""== Physical Plan == - |${stringOrError(executedPlan)} - """.stripMargin.trim - - override def toString: String = { - def output = - analyzed.output.map(o => s"${o.name}: ${o.dataType.simpleString}").mkString(", ") - - s"""== Parsed Logical Plan == - |${stringOrError(logical)} - |== Analyzed Logical Plan == - |${stringOrError(output)} - |${stringOrError(analyzed)} - |== Optimized Logical Plan == - |${stringOrError(optimizedPlan)} - |== Physical Plan == - |${stringOrError(executedPlan)} - |Code Generation: ${stringOrError(executedPlan.codegenEnabled)} - """.stripMargin.trim - } - } + @deprecated("use org.apache.spark.sql.QueryExecution", "1.6.0") + protected[sql] class QueryExecution(logical: LogicalPlan) + extends sparkexecution.QueryExecution(this, logical) /** * Parses the data type in our internal string representation. The data type string should diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/QueryExecution.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/QueryExecution.scala new file mode 100644 index 0000000000000..7bb4133a29059 --- /dev/null +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/QueryExecution.scala @@ -0,0 +1,85 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.spark.sql.execution + +import org.apache.spark.annotation.{Experimental, DeveloperApi} +import org.apache.spark.rdd.RDD +import org.apache.spark.sql.catalyst.{InternalRow, optimizer} +import org.apache.spark.sql.{SQLContext, Row} +import org.apache.spark.sql.catalyst.plans.logical.LogicalPlan + +/** + * :: DeveloperApi :: + * The primary workflow for executing relational queries using Spark. Designed to allow easy + * access to the intermediate phases of query execution for developers. + */ +@DeveloperApi +class QueryExecution(val sqlContext: SQLContext, val logical: LogicalPlan) { + val analyzer = sqlContext.analyzer + val optimizer = sqlContext.optimizer + val planner = sqlContext.planner + val cacheManager = sqlContext.cacheManager + val prepareForExecution = sqlContext.prepareForExecution + + def assertAnalyzed(): Unit = analyzer.checkAnalysis(analyzed) + + lazy val analyzed: LogicalPlan = analyzer.execute(logical) + lazy val withCachedData: LogicalPlan = { + assertAnalyzed() + cacheManager.useCachedData(analyzed) + } + lazy val optimizedPlan: LogicalPlan = optimizer.execute(withCachedData) + + // TODO: Don't just pick the first one... 
+ lazy val sparkPlan: SparkPlan = { + SparkPlan.currentContext.set(sqlContext) + planner.plan(optimizedPlan).next() + } + // executedPlan should not be used to initialize any SparkPlan. It should be + // only used for execution. + lazy val executedPlan: SparkPlan = prepareForExecution.execute(sparkPlan) + + /** Internal version of the RDD. Avoids copies and has no schema */ + lazy val toRdd: RDD[InternalRow] = executedPlan.execute() + + protected def stringOrError[A](f: => A): String = + try f.toString catch { case e: Throwable => e.toString } + + def simpleString: String = + s"""== Physical Plan == + |${stringOrError(executedPlan)} + """.stripMargin.trim + + + override def toString: String = { + def output = + analyzed.output.map(o => s"${o.name}: ${o.dataType.simpleString}").mkString(", ") + + s"""== Parsed Logical Plan == + |${stringOrError(logical)} + |== Analyzed Logical Plan == + |${stringOrError(output)} + |${stringOrError(analyzed)} + |== Optimized Logical Plan == + |${stringOrError(optimizedPlan)} + |== Physical Plan == + |${stringOrError(executedPlan)} + |Code Generation: ${stringOrError(executedPlan.codegenEnabled)} + """.stripMargin.trim + } +} diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/SQLExecution.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/SQLExecution.scala index cee58218a885b..1422e15549c94 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/SQLExecution.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/SQLExecution.scala @@ -37,7 +37,7 @@ private[sql] object SQLExecution { * we can connect them with an execution. */ def withNewExecutionId[T]( - sqlContext: SQLContext, queryExecution: SQLContext#QueryExecution)(body: => T): T = { + sqlContext: SQLContext, queryExecution: QueryExecution)(body: => T): T = { val sc = sqlContext.sparkContext val oldExecutionId = sc.getLocalProperty(EXECUTION_ID_KEY) if (oldExecutionId == null) { diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/SparkPlanner.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/SparkPlanner.scala new file mode 100644 index 0000000000000..b346f43faebe2 --- /dev/null +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/SparkPlanner.scala @@ -0,0 +1,92 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.spark.sql.execution + +import org.apache.spark.SparkContext +import org.apache.spark.annotation.Experimental +import org.apache.spark.sql._ +import org.apache.spark.sql.catalyst.expressions._ +import org.apache.spark.sql.execution.datasources.DataSourceStrategy + +@Experimental +class SparkPlanner(val sqlContext: SQLContext) extends SparkStrategies { + val sparkContext: SparkContext = sqlContext.sparkContext + + def codegenEnabled: Boolean = sqlContext.conf.codegenEnabled + + def unsafeEnabled: Boolean = sqlContext.conf.unsafeEnabled + + def numPartitions: Int = sqlContext.conf.numShufflePartitions + + def strategies: Seq[Strategy] = + sqlContext.experimental.extraStrategies ++ ( + DataSourceStrategy :: + DDLStrategy :: + TakeOrderedAndProject :: + HashAggregation :: + Aggregation :: + LeftSemiJoin :: + EquiJoinSelection :: + InMemoryScans :: + BasicOperators :: + CartesianProduct :: + BroadcastNestedLoopJoin :: Nil) + + /** + * Used to build table scan operators where complex projection and filtering are done using + * separate physical operators. This function returns the given scan operator with Project and + * Filter nodes added only when needed. For example, a Project operator is only used when the + * final desired output requires complex expressions to be evaluated or when columns can be + * further eliminated out after filtering has been done. + * + * The `prunePushedDownFilters` parameter is used to remove those filters that can be optimized + * away by the filter pushdown optimization. + * + * The required attributes for both filtering and expression evaluation are passed to the + * provided `scanBuilder` function so that it can avoid unnecessary column materialization. + */ + def pruneFilterProject( + projectList: Seq[NamedExpression], + filterPredicates: Seq[Expression], + prunePushedDownFilters: Seq[Expression] => Seq[Expression], + scanBuilder: Seq[Attribute] => SparkPlan): SparkPlan = { + + val projectSet = AttributeSet(projectList.flatMap(_.references)) + val filterSet = AttributeSet(filterPredicates.flatMap(_.references)) + val filterCondition = + prunePushedDownFilters(filterPredicates).reduceLeftOption(catalyst.expressions.And) + + // Right now we still use a projection even if the only evaluation is applying an alias + // to a column. Since this is a no-op, it could be avoided. However, using this + // optimization with the current implementation would change the output schema. + // TODO: Decouple final output schema from expression evaluation so this copy can be + // avoided safely. + + if (AttributeSet(projectList.map(_.toAttribute)) == projectSet && + filterSet.subsetOf(projectSet)) { + // When it is possible to just use column pruning to get the right projection and + // when the columns of this projection are enough to evaluate all filter conditions, + // just do a scan followed by a filter, with no extra project. 
+ val scan = scanBuilder(projectList.asInstanceOf[Seq[Attribute]]) + filterCondition.map(Filter(_, scan)).getOrElse(scan) + } else { + val scan = scanBuilder((projectSet ++ filterSet).toSeq) + Project(projectList, filterCondition.map(Filter(_, scan)).getOrElse(scan)) + } + } +} diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/SparkStrategies.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/SparkStrategies.scala index 4572d5efc92bb..5e40d77689045 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/SparkStrategies.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/SparkStrategies.scala @@ -31,7 +31,7 @@ import org.apache.spark.sql.types._ import org.apache.spark.sql.{SQLContext, Strategy, execution} private[sql] abstract class SparkStrategies extends QueryPlanner[SparkPlan] { - self: SQLContext#SparkPlanner => + self: SparkPlanner => object LeftSemiJoin extends Strategy with PredicateHelper { def apply(plan: LogicalPlan): Seq[SparkPlan] = plan match { From 217e4964444f4e07b894b1bca768a0cbbe799ea0 Mon Sep 17 00:00:00 2001 From: zsxwing Date: Mon, 14 Sep 2015 15:00:27 -0700 Subject: [PATCH 278/802] [SPARK-9996] [SPARK-9997] [SQL] Add local expand and NestedLoopJoin operators This PR is in conflict with #8535 and #8573. Will update this one when they are merged. Author: zsxwing Closes #8642 from zsxwing/expand-nest-join. --- .../sql/execution/local/ExpandNode.scala | 60 +++++ .../spark/sql/execution/local/LocalNode.scala | 55 +++- .../execution/local/NestedLoopJoinNode.scala | 156 ++++++++++++ .../sql/execution/local/ExpandNodeSuite.scala | 51 ++++ .../execution/local/HashJoinNodeSuite.scala | 14 - .../sql/execution/local/LocalNodeTest.scala | 14 + .../local/NestedLoopJoinNodeSuite.scala | 239 ++++++++++++++++++ 7 files changed, 574 insertions(+), 15 deletions(-) create mode 100644 sql/core/src/main/scala/org/apache/spark/sql/execution/local/ExpandNode.scala create mode 100644 sql/core/src/main/scala/org/apache/spark/sql/execution/local/NestedLoopJoinNode.scala create mode 100644 sql/core/src/test/scala/org/apache/spark/sql/execution/local/ExpandNodeSuite.scala create mode 100644 sql/core/src/test/scala/org/apache/spark/sql/execution/local/NestedLoopJoinNodeSuite.scala diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/local/ExpandNode.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/local/ExpandNode.scala new file mode 100644 index 0000000000000..2aff156d18b54 --- /dev/null +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/local/ExpandNode.scala @@ -0,0 +1,60 @@ +/* +* Licensed to the Apache Software Foundation (ASF) under one or more +* contributor license agreements. See the NOTICE file distributed with +* this work for additional information regarding copyright ownership. +* The ASF licenses this file to You under the Apache License, Version 2.0 +* (the "License"); you may not use this file except in compliance with +* the License. You may obtain a copy of the License at +* +* http://www.apache.org/licenses/LICENSE-2.0 +* +* Unless required by applicable law or agreed to in writing, software +* distributed under the License is distributed on an "AS IS" BASIS, +* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +* See the License for the specific language governing permissions and +* limitations under the License. 
+*/ + +package org.apache.spark.sql.execution.local + +import org.apache.spark.sql.SQLConf +import org.apache.spark.sql.catalyst.InternalRow +import org.apache.spark.sql.catalyst.expressions.{Attribute, Expression, Projection} + +case class ExpandNode( + conf: SQLConf, + projections: Seq[Seq[Expression]], + output: Seq[Attribute], + child: LocalNode) extends UnaryLocalNode(conf) { + + assert(projections.size > 0) + + private[this] var result: InternalRow = _ + private[this] var idx: Int = _ + private[this] var input: InternalRow = _ + private[this] var groups: Array[Projection] = _ + + override def open(): Unit = { + child.open() + groups = projections.map(ee => newProjection(ee, child.output)).toArray + idx = groups.length + } + + override def next(): Boolean = { + if (idx >= groups.length) { + if (child.next()) { + input = child.fetch() + idx = 0 + } else { + return false + } + } + result = groups(idx)(input) + idx += 1 + true + } + + override def fetch(): InternalRow = result + + override def close(): Unit = child.close() +} diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/local/LocalNode.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/local/LocalNode.scala index e540ef8555eb6..9840080e16953 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/local/LocalNode.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/local/LocalNode.scala @@ -23,7 +23,7 @@ import org.apache.spark.Logging import org.apache.spark.sql.{SQLConf, Row} import org.apache.spark.sql.catalyst.{CatalystTypeConverters, InternalRow} import org.apache.spark.sql.catalyst.expressions._ -import org.apache.spark.sql.catalyst.expressions.codegen.GenerateMutableProjection +import org.apache.spark.sql.catalyst.expressions.codegen._ import org.apache.spark.sql.catalyst.trees.TreeNode import org.apache.spark.sql.types.StructType @@ -69,6 +69,18 @@ abstract class LocalNode(conf: SQLConf) extends TreeNode[LocalNode] with Logging */ def close(): Unit + /** Specifies whether this operator outputs UnsafeRows */ + def outputsUnsafeRows: Boolean = false + + /** Specifies whether this operator is capable of processing UnsafeRows */ + def canProcessUnsafeRows: Boolean = false + + /** + * Specifies whether this operator is capable of processing Java-object-based Rows (i.e. rows + * that are not UnsafeRows). + */ + def canProcessSafeRows: Boolean = true + /** * Returns the content through the [[Iterator]] interface. 
*/ @@ -91,6 +103,28 @@ abstract class LocalNode(conf: SQLConf) extends TreeNode[LocalNode] with Logging result } + protected def newProjection( + expressions: Seq[Expression], + inputSchema: Seq[Attribute]): Projection = { + log.debug( + s"Creating Projection: $expressions, inputSchema: $inputSchema, codegen:$codegenEnabled") + if (codegenEnabled) { + try { + GenerateProjection.generate(expressions, inputSchema) + } catch { + case NonFatal(e) => + if (isTesting) { + throw e + } else { + log.error("Failed to generate projection, fallback to interpret", e) + new InterpretedProjection(expressions, inputSchema) + } + } + } else { + new InterpretedProjection(expressions, inputSchema) + } + } + protected def newMutableProjection( expressions: Seq[Expression], inputSchema: Seq[Attribute]): () => MutableProjection = { @@ -113,6 +147,25 @@ abstract class LocalNode(conf: SQLConf) extends TreeNode[LocalNode] with Logging } } + protected def newPredicate( + expression: Expression, + inputSchema: Seq[Attribute]): (InternalRow) => Boolean = { + if (codegenEnabled) { + try { + GeneratePredicate.generate(expression, inputSchema) + } catch { + case NonFatal(e) => + if (isTesting) { + throw e + } else { + log.error("Failed to generate predicate, fallback to interpreted", e) + InterpretedPredicate.create(expression, inputSchema) + } + } + } else { + InterpretedPredicate.create(expression, inputSchema) + } + } } diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/local/NestedLoopJoinNode.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/local/NestedLoopJoinNode.scala new file mode 100644 index 0000000000000..7321fc66b4dde --- /dev/null +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/local/NestedLoopJoinNode.scala @@ -0,0 +1,156 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.spark.sql.execution.local + +import org.apache.spark.sql.SQLConf +import org.apache.spark.sql.catalyst.InternalRow +import org.apache.spark.sql.catalyst.expressions._ +import org.apache.spark.sql.catalyst.plans.{FullOuter, RightOuter, LeftOuter, JoinType} +import org.apache.spark.sql.execution.joins.{BuildLeft, BuildRight, BuildSide} +import org.apache.spark.util.collection.{BitSet, CompactBuffer} + +case class NestedLoopJoinNode( + conf: SQLConf, + left: LocalNode, + right: LocalNode, + buildSide: BuildSide, + joinType: JoinType, + condition: Option[Expression]) extends BinaryLocalNode(conf) { + + override def output: Seq[Attribute] = { + joinType match { + case LeftOuter => + left.output ++ right.output.map(_.withNullability(true)) + case RightOuter => + left.output.map(_.withNullability(true)) ++ right.output + case FullOuter => + left.output.map(_.withNullability(true)) ++ right.output.map(_.withNullability(true)) + case x => + throw new IllegalArgumentException( + s"NestedLoopJoin should not take $x as the JoinType") + } + } + + private[this] def genResultProjection: InternalRow => InternalRow = { + if (outputsUnsafeRows) { + UnsafeProjection.create(schema) + } else { + identity[InternalRow] + } + } + + private[this] var currentRow: InternalRow = _ + + private[this] var iterator: Iterator[InternalRow] = _ + + override def open(): Unit = { + val (streamed, build) = buildSide match { + case BuildRight => (left, right) + case BuildLeft => (right, left) + } + build.open() + val buildRelation = new CompactBuffer[InternalRow] + while (build.next()) { + buildRelation += build.fetch().copy() + } + build.close() + + val boundCondition = + newPredicate(condition.getOrElse(Literal(true)), left.output ++ right.output) + + val leftNulls = new GenericMutableRow(left.output.size) + val rightNulls = new GenericMutableRow(right.output.size) + val joinedRow = new JoinedRow + val matchedBuildTuples = new BitSet(buildRelation.size) + val resultProj = genResultProjection + streamed.open() + + // streamedRowMatches also contains null rows if using outer join + val streamedRowMatches: Iterator[InternalRow] = streamed.asIterator.flatMap { streamedRow => + val matchedRows = new CompactBuffer[InternalRow] + + var i = 0 + var streamRowMatched = false + + // Scan the build relation to look for matches for each streamed row + while (i < buildRelation.size) { + val buildRow = buildRelation(i) + buildSide match { + case BuildRight => joinedRow(streamedRow, buildRow) + case BuildLeft => joinedRow(buildRow, streamedRow) + } + if (boundCondition(joinedRow)) { + matchedRows += resultProj(joinedRow).copy() + streamRowMatched = true + matchedBuildTuples.set(i) + } + i += 1 + } + + // If this row had no matches and we're using outer join, join it with the null rows + if (!streamRowMatched) { + (joinType, buildSide) match { + case (LeftOuter | FullOuter, BuildRight) => + matchedRows += resultProj(joinedRow(streamedRow, rightNulls)).copy() + case (RightOuter | FullOuter, BuildLeft) => + matchedRows += resultProj(joinedRow(leftNulls, streamedRow)).copy() + case _ => + } + } + + matchedRows.iterator + } + + // If we're using outer join, find rows on the build side that didn't match anything + // and join them with the null row + lazy val unmatchedBuildRows: Iterator[InternalRow] = { + var i = 0 + buildRelation.filter { row => + val r = !matchedBuildTuples.get(i) + i += 1 + r + }.iterator + } + iterator = (joinType, buildSide) match { + case (RightOuter | FullOuter, BuildRight) => + 
streamedRowMatches ++ + unmatchedBuildRows.map { buildRow => resultProj(joinedRow(leftNulls, buildRow)) } + case (LeftOuter | FullOuter, BuildLeft) => + streamedRowMatches ++ + unmatchedBuildRows.map { buildRow => resultProj(joinedRow(buildRow, rightNulls)) } + case _ => streamedRowMatches + } + } + + override def next(): Boolean = { + if (iterator.hasNext) { + currentRow = iterator.next() + true + } else { + false + } + } + + override def fetch(): InternalRow = currentRow + + override def close(): Unit = { + left.close() + right.close() + } + +} diff --git a/sql/core/src/test/scala/org/apache/spark/sql/execution/local/ExpandNodeSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/execution/local/ExpandNodeSuite.scala new file mode 100644 index 0000000000000..cfa7f3f6dcb97 --- /dev/null +++ b/sql/core/src/test/scala/org/apache/spark/sql/execution/local/ExpandNodeSuite.scala @@ -0,0 +1,51 @@ +/* +* Licensed to the Apache Software Foundation (ASF) under one or more +* contributor license agreements. See the NOTICE file distributed with +* this work for additional information regarding copyright ownership. +* The ASF licenses this file to You under the Apache License, Version 2.0 +* (the "License"); you may not use this file except in compliance with +* the License. You may obtain a copy of the License at +* +* http://www.apache.org/licenses/LICENSE-2.0 +* +* Unless required by applicable law or agreed to in writing, software +* distributed under the License is distributed on an "AS IS" BASIS, +* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +* See the License for the specific language governing permissions and +* limitations under the License. +*/ + +package org.apache.spark.sql.execution.local + +class ExpandNodeSuite extends LocalNodeTest { + + import testImplicits._ + + test("expand") { + val input = Seq((1, 1), (2, 2), (3, 3), (4, 4), (5, 5)).toDF("key", "value") + checkAnswer( + input, + node => + ExpandNode(conf, Seq( + Seq( + input.col("key") + input.col("value"), input.col("key") - input.col("value") + ).map(_.expr), + Seq( + input.col("key") * input.col("value"), input.col("key") / input.col("value") + ).map(_.expr) + ), node.output, node), + Seq( + (2, 0), + (1, 1), + (4, 0), + (4, 1), + (6, 0), + (9, 1), + (8, 0), + (16, 1), + (10, 0), + (25, 1) + ).toDF().collect() + ) + } +} diff --git a/sql/core/src/test/scala/org/apache/spark/sql/execution/local/HashJoinNodeSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/execution/local/HashJoinNodeSuite.scala index 43b6f06aead88..78d891351f4a9 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/execution/local/HashJoinNodeSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/execution/local/HashJoinNodeSuite.scala @@ -24,20 +24,6 @@ class HashJoinNodeSuite extends LocalNodeTest { import testImplicits._ - private def wrapForUnsafe( - f: (LocalNode, LocalNode) => LocalNode): (LocalNode, LocalNode) => LocalNode = { - if (conf.unsafeEnabled) { - (left: LocalNode, right: LocalNode) => { - val _left = ConvertToUnsafeNode(conf, left) - val _right = ConvertToUnsafeNode(conf, right) - val r = f(_left, _right) - ConvertToSafeNode(conf, r) - } - } else { - f - } - } - def joinSuite(suiteName: String, confPairs: (String, String)*): Unit = { test(s"$suiteName: inner join with one match per row") { withSQLConf(confPairs: _*) { diff --git a/sql/core/src/test/scala/org/apache/spark/sql/execution/local/LocalNodeTest.scala 
b/sql/core/src/test/scala/org/apache/spark/sql/execution/local/LocalNodeTest.scala index b95d4ea7f8f2a..86dd28064cc6a 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/execution/local/LocalNodeTest.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/execution/local/LocalNodeTest.scala @@ -27,6 +27,20 @@ class LocalNodeTest extends SparkFunSuite with SharedSQLContext { def conf: SQLConf = sqlContext.conf + protected def wrapForUnsafe( + f: (LocalNode, LocalNode) => LocalNode): (LocalNode, LocalNode) => LocalNode = { + if (conf.unsafeEnabled) { + (left: LocalNode, right: LocalNode) => { + val _left = ConvertToUnsafeNode(conf, left) + val _right = ConvertToUnsafeNode(conf, right) + val r = f(_left, _right) + ConvertToSafeNode(conf, r) + } + } else { + f + } + } + /** * Runs the LocalNode and makes sure the answer matches the expected result. * @param input the input data to be used. diff --git a/sql/core/src/test/scala/org/apache/spark/sql/execution/local/NestedLoopJoinNodeSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/execution/local/NestedLoopJoinNodeSuite.scala new file mode 100644 index 0000000000000..b1ef26ba82f16 --- /dev/null +++ b/sql/core/src/test/scala/org/apache/spark/sql/execution/local/NestedLoopJoinNodeSuite.scala @@ -0,0 +1,239 @@ +/* +* Licensed to the Apache Software Foundation (ASF) under one or more +* contributor license agreements. See the NOTICE file distributed with +* this work for additional information regarding copyright ownership. +* The ASF licenses this file to You under the Apache License, Version 2.0 +* (the "License"); you may not use this file except in compliance with +* the License. You may obtain a copy of the License at +* +* http://www.apache.org/licenses/LICENSE-2.0 +* +* Unless required by applicable law or agreed to in writing, software +* distributed under the License is distributed on an "AS IS" BASIS, +* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +* See the License for the specific language governing permissions and +* limitations under the License. 
+*/ + +package org.apache.spark.sql.execution.local + +import org.apache.spark.sql.SQLConf +import org.apache.spark.sql.catalyst.plans.{FullOuter, LeftOuter, RightOuter} +import org.apache.spark.sql.execution.joins.{BuildLeft, BuildRight, BuildSide} + +class NestedLoopJoinNodeSuite extends LocalNodeTest { + + import testImplicits._ + + private def joinSuite( + suiteName: String, buildSide: BuildSide, confPairs: (String, String)*): Unit = { + test(s"$suiteName: left outer join") { + withSQLConf(confPairs: _*) { + checkAnswer2( + upperCaseData, + lowerCaseData, + wrapForUnsafe( + (node1, node2) => NestedLoopJoinNode( + conf, + node1, + node2, + buildSide, + LeftOuter, + Some((upperCaseData.col("N") === lowerCaseData.col("n")).expr)) + ), + upperCaseData.join(lowerCaseData, $"n" === $"N", "left").collect()) + + checkAnswer2( + upperCaseData, + lowerCaseData, + wrapForUnsafe( + (node1, node2) => NestedLoopJoinNode( + conf, + node1, + node2, + buildSide, + LeftOuter, + Some( + (upperCaseData.col("N") === lowerCaseData.col("n") && + lowerCaseData.col("n") > 1).expr)) + ), + upperCaseData.join(lowerCaseData, $"n" === $"N" && $"n" > 1, "left").collect()) + + checkAnswer2( + upperCaseData, + lowerCaseData, + wrapForUnsafe( + (node1, node2) => NestedLoopJoinNode( + conf, + node1, + node2, + buildSide, + LeftOuter, + Some( + (upperCaseData.col("N") === lowerCaseData.col("n") && + upperCaseData.col("N") > 1).expr)) + ), + upperCaseData.join(lowerCaseData, $"n" === $"N" && $"N" > 1, "left").collect()) + + checkAnswer2( + upperCaseData, + lowerCaseData, + wrapForUnsafe( + (node1, node2) => NestedLoopJoinNode( + conf, + node1, + node2, + buildSide, + LeftOuter, + Some( + (upperCaseData.col("N") === lowerCaseData.col("n") && + lowerCaseData.col("l") > upperCaseData.col("L")).expr)) + ), + upperCaseData.join(lowerCaseData, $"n" === $"N" && $"l" > $"L", "left").collect()) + } + } + + test(s"$suiteName: right outer join") { + withSQLConf(confPairs: _*) { + checkAnswer2( + lowerCaseData, + upperCaseData, + wrapForUnsafe( + (node1, node2) => NestedLoopJoinNode( + conf, + node1, + node2, + buildSide, + RightOuter, + Some((lowerCaseData.col("n") === upperCaseData.col("N")).expr)) + ), + lowerCaseData.join(upperCaseData, $"n" === $"N", "right").collect()) + + checkAnswer2( + lowerCaseData, + upperCaseData, + wrapForUnsafe( + (node1, node2) => NestedLoopJoinNode( + conf, + node1, + node2, + buildSide, + RightOuter, + Some((lowerCaseData.col("n") === upperCaseData.col("N") && + lowerCaseData.col("n") > 1).expr)) + ), + lowerCaseData.join(upperCaseData, $"n" === $"N" && $"n" > 1, "right").collect()) + + checkAnswer2( + lowerCaseData, + upperCaseData, + wrapForUnsafe( + (node1, node2) => NestedLoopJoinNode( + conf, + node1, + node2, + buildSide, + RightOuter, + Some((lowerCaseData.col("n") === upperCaseData.col("N") && + upperCaseData.col("N") > 1).expr)) + ), + lowerCaseData.join(upperCaseData, $"n" === $"N" && $"N" > 1, "right").collect()) + + checkAnswer2( + lowerCaseData, + upperCaseData, + wrapForUnsafe( + (node1, node2) => NestedLoopJoinNode( + conf, + node1, + node2, + buildSide, + RightOuter, + Some((lowerCaseData.col("n") === upperCaseData.col("N") && + lowerCaseData.col("l") > upperCaseData.col("L")).expr)) + ), + lowerCaseData.join(upperCaseData, $"n" === $"N" && $"l" > $"L", "right").collect()) + } + } + + test(s"$suiteName: full outer join") { + withSQLConf(confPairs: _*) { + checkAnswer2( + lowerCaseData, + upperCaseData, + wrapForUnsafe( + (node1, node2) => NestedLoopJoinNode( + conf, + node1, + node2, 
+ buildSide, + FullOuter, + Some((lowerCaseData.col("n") === upperCaseData.col("N")).expr)) + ), + lowerCaseData.join(upperCaseData, $"n" === $"N", "full").collect()) + + checkAnswer2( + lowerCaseData, + upperCaseData, + wrapForUnsafe( + (node1, node2) => NestedLoopJoinNode( + conf, + node1, + node2, + buildSide, + FullOuter, + Some((lowerCaseData.col("n") === upperCaseData.col("N") && + lowerCaseData.col("n") > 1).expr)) + ), + lowerCaseData.join(upperCaseData, $"n" === $"N" && $"n" > 1, "full").collect()) + + checkAnswer2( + lowerCaseData, + upperCaseData, + wrapForUnsafe( + (node1, node2) => NestedLoopJoinNode( + conf, + node1, + node2, + buildSide, + FullOuter, + Some((lowerCaseData.col("n") === upperCaseData.col("N") && + upperCaseData.col("N") > 1).expr)) + ), + lowerCaseData.join(upperCaseData, $"n" === $"N" && $"N" > 1, "full").collect()) + + checkAnswer2( + lowerCaseData, + upperCaseData, + wrapForUnsafe( + (node1, node2) => NestedLoopJoinNode( + conf, + node1, + node2, + buildSide, + FullOuter, + Some((lowerCaseData.col("n") === upperCaseData.col("N") && + lowerCaseData.col("l") > upperCaseData.col("L")).expr)) + ), + lowerCaseData.join(upperCaseData, $"n" === $"N" && $"l" > $"L", "full").collect()) + } + } + } + + joinSuite( + "general-build-left", + BuildLeft, + SQLConf.CODEGEN_ENABLED.key -> "false", SQLConf.UNSAFE_ENABLED.key -> "false") + joinSuite( + "general-build-right", + BuildRight, + SQLConf.CODEGEN_ENABLED.key -> "false", SQLConf.UNSAFE_ENABLED.key -> "false") + joinSuite( + "tungsten-build-left", + BuildLeft, + SQLConf.CODEGEN_ENABLED.key -> "true", SQLConf.UNSAFE_ENABLED.key -> "true") + joinSuite( + "tungsten-build-right", + BuildRight, + SQLConf.CODEGEN_ENABLED.key -> "true", SQLConf.UNSAFE_ENABLED.key -> "true") +} From 16b6d18613e150c7038c613992d80a7828413e66 Mon Sep 17 00:00:00 2001 From: Erick Tryzelaar Date: Mon, 14 Sep 2015 15:02:38 -0700 Subject: [PATCH 279/802] [SPARK-10594] [YARN] Remove reference to --num-executors, add --properties-file `ApplicationMaster` no longer has the `--num-executors` flag, and had an undocumented `--properties-file` configuration option. cc srowen Author: Erick Tryzelaar Closes #8754 from erickt/master. --- .../apache/spark/deploy/yarn/ApplicationMasterArguments.scala | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/yarn/src/main/scala/org/apache/spark/deploy/yarn/ApplicationMasterArguments.scala b/yarn/src/main/scala/org/apache/spark/deploy/yarn/ApplicationMasterArguments.scala index b08412414aa1c..17d9943c795e3 100644 --- a/yarn/src/main/scala/org/apache/spark/deploy/yarn/ApplicationMasterArguments.scala +++ b/yarn/src/main/scala/org/apache/spark/deploy/yarn/ApplicationMasterArguments.scala @@ -105,9 +105,9 @@ class ApplicationMasterArguments(val args: Array[String]) { | place on the PYTHONPATH for Python apps. | --args ARGS Arguments to be passed to your application's main class. | Multiple invocations are possible, each will be passed in order. - | --num-executors NUM Number of executors to start (Default: 2) | --executor-cores NUM Number of cores for the executors (Default: 1) | --executor-memory MEM Memory per executor (e.g. 1000M, 2G) (Default: 1G) + | --properties-file FILE Path to a custom Spark properties file. 
""".stripMargin) // scalastyle:on println System.exit(exitCode) From 4e2242bb41dda922573046c00c5142745632f95f Mon Sep 17 00:00:00 2001 From: Sean Owen Date: Mon, 14 Sep 2015 15:03:51 -0700 Subject: [PATCH 280/802] [SPARK-10576] [BUILD] Move .java files out of src/main/scala Move .java files in `src/main/scala` to `src/main/java` root, except for `package-info.java` (to stay next to package.scala) Author: Sean Owen Closes #8736 from srowen/SPARK-10576. --- .../org/apache/spark/annotation/AlphaComponent.java | 0 .../{scala => java}/org/apache/spark/annotation/DeveloperApi.java | 0 .../{scala => java}/org/apache/spark/annotation/Experimental.java | 0 .../main/{scala => java}/org/apache/spark/annotation/Private.java | 0 .../{scala => java}/org/apache/spark/graphx/TripletFields.java | 0 .../org/apache/spark/graphx/impl/EdgeActiveness.java | 0 .../org/apache/spark/sql/types/SQLUserDefinedType.java | 0 .../org/apache/spark/streaming/StreamingContextState.java | 0 8 files changed, 0 insertions(+), 0 deletions(-) rename core/src/main/{scala => java}/org/apache/spark/annotation/AlphaComponent.java (100%) rename core/src/main/{scala => java}/org/apache/spark/annotation/DeveloperApi.java (100%) rename core/src/main/{scala => java}/org/apache/spark/annotation/Experimental.java (100%) rename core/src/main/{scala => java}/org/apache/spark/annotation/Private.java (100%) rename graphx/src/main/{scala => java}/org/apache/spark/graphx/TripletFields.java (100%) rename graphx/src/main/{scala => java}/org/apache/spark/graphx/impl/EdgeActiveness.java (100%) rename sql/catalyst/src/main/{scala => java}/org/apache/spark/sql/types/SQLUserDefinedType.java (100%) rename streaming/src/main/{scala => java}/org/apache/spark/streaming/StreamingContextState.java (100%) diff --git a/core/src/main/scala/org/apache/spark/annotation/AlphaComponent.java b/core/src/main/java/org/apache/spark/annotation/AlphaComponent.java similarity index 100% rename from core/src/main/scala/org/apache/spark/annotation/AlphaComponent.java rename to core/src/main/java/org/apache/spark/annotation/AlphaComponent.java diff --git a/core/src/main/scala/org/apache/spark/annotation/DeveloperApi.java b/core/src/main/java/org/apache/spark/annotation/DeveloperApi.java similarity index 100% rename from core/src/main/scala/org/apache/spark/annotation/DeveloperApi.java rename to core/src/main/java/org/apache/spark/annotation/DeveloperApi.java diff --git a/core/src/main/scala/org/apache/spark/annotation/Experimental.java b/core/src/main/java/org/apache/spark/annotation/Experimental.java similarity index 100% rename from core/src/main/scala/org/apache/spark/annotation/Experimental.java rename to core/src/main/java/org/apache/spark/annotation/Experimental.java diff --git a/core/src/main/scala/org/apache/spark/annotation/Private.java b/core/src/main/java/org/apache/spark/annotation/Private.java similarity index 100% rename from core/src/main/scala/org/apache/spark/annotation/Private.java rename to core/src/main/java/org/apache/spark/annotation/Private.java diff --git a/graphx/src/main/scala/org/apache/spark/graphx/TripletFields.java b/graphx/src/main/java/org/apache/spark/graphx/TripletFields.java similarity index 100% rename from graphx/src/main/scala/org/apache/spark/graphx/TripletFields.java rename to graphx/src/main/java/org/apache/spark/graphx/TripletFields.java diff --git a/graphx/src/main/scala/org/apache/spark/graphx/impl/EdgeActiveness.java b/graphx/src/main/java/org/apache/spark/graphx/impl/EdgeActiveness.java similarity index 100% rename from 
graphx/src/main/scala/org/apache/spark/graphx/impl/EdgeActiveness.java rename to graphx/src/main/java/org/apache/spark/graphx/impl/EdgeActiveness.java diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/types/SQLUserDefinedType.java b/sql/catalyst/src/main/java/org/apache/spark/sql/types/SQLUserDefinedType.java similarity index 100% rename from sql/catalyst/src/main/scala/org/apache/spark/sql/types/SQLUserDefinedType.java rename to sql/catalyst/src/main/java/org/apache/spark/sql/types/SQLUserDefinedType.java diff --git a/streaming/src/main/scala/org/apache/spark/streaming/StreamingContextState.java b/streaming/src/main/java/org/apache/spark/streaming/StreamingContextState.java similarity index 100% rename from streaming/src/main/scala/org/apache/spark/streaming/StreamingContextState.java rename to streaming/src/main/java/org/apache/spark/streaming/StreamingContextState.java From ffbbc2c58b9bf1e2abc2ea797feada6821ab4de8 Mon Sep 17 00:00:00 2001 From: Tom Graves Date: Mon, 14 Sep 2015 15:05:19 -0700 Subject: [PATCH 281/802] [SPARK-10549] scala 2.11 spark on yarn with security - Repl doesn't work Make this lazy so that it can set the yarn mode before creating the securityManager. Author: Tom Graves Author: Thomas Graves Closes #8719 from tgravescs/SPARK-10549. --- .../scala-2.11/src/main/scala/org/apache/spark/repl/Main.scala | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/repl/scala-2.11/src/main/scala/org/apache/spark/repl/Main.scala b/repl/scala-2.11/src/main/scala/org/apache/spark/repl/Main.scala index be31eb2eda546..627148df80c11 100644 --- a/repl/scala-2.11/src/main/scala/org/apache/spark/repl/Main.scala +++ b/repl/scala-2.11/src/main/scala/org/apache/spark/repl/Main.scala @@ -35,7 +35,8 @@ object Main extends Logging { s.processArguments(List("-Yrepl-class-based", "-Yrepl-outdir", s"${outputDir.getAbsolutePath}", "-classpath", getAddedJars.mkString(File.pathSeparator)), true) - val classServer = new HttpServer(conf, outputDir, new SecurityManager(conf)) + // the creation of SecurityManager has to be lazy so SPARK_YARN_MODE is set if needed + lazy val classServer = new HttpServer(conf, outputDir, new SecurityManager(conf)) var sparkContext: SparkContext = _ var sqlContext: SQLContext = _ var interp = new SparkILoop // this is a public var because tests reset it. From fd1e8cddf2635c55fec2ac6e1f1c221c9685af0f Mon Sep 17 00:00:00 2001 From: Forest Fang Date: Mon, 14 Sep 2015 15:07:13 -0700 Subject: [PATCH 282/802] [SPARK-10543] [CORE] Peak Execution Memory Quantile should be Per-task Basis Read `PEAK_EXECUTION_MEMORY` using `update` to get per task partial value instead of cumulative value. I tested with this workload: ```scala val size = 1000 val repetitions = 10 val data = sc.parallelize(1 to size, 5).map(x => (util.Random.nextInt(size / repetitions),util.Random.nextDouble)).toDF("key", "value") val res = data.toDF.groupBy("key").agg(sum("value")).count ``` Before: ![image](https://cloud.githubusercontent.com/assets/4317392/9828197/07dd6874-58b8-11e5-9bd9-6ba927c38b26.png) After: ![image](https://cloud.githubusercontent.com/assets/4317392/9828151/a5ddff30-58b7-11e5-8d31-eda5dc4eae79.png) Tasks view: ![image](https://cloud.githubusercontent.com/assets/4317392/9828199/17dc2b84-58b8-11e5-92a8-be89ce4d29d1.png) cc andrewor14 I appreciate if you can give feedback on this since I think you introduced display of this metric. Author: Forest Fang Closes #8726 from saurfang/stagepage. 
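The distinction the fix turns on, with made-up numbers: an accumulable's value is the running total across all finished tasks, while its update is the single task's own contribution, so per-task quantiles have to be computed from update. A self-contained sketch (AccInfo stands in for Spark's AccumulableInfo):

```scala
// Two tasks that each peak at 10 bytes of execution memory.
case class AccInfo(update: Option[String], value: String)

val task1 = AccInfo(update = Some("10"), value = "10") // running total after task 1
val task2 = AccInfo(update = Some("10"), value = "20") // running total after task 2

// Quantiles over `value` would see 10 and 20 -- cumulative and misleading.
// Quantiles over `update` see 10 and 10 -- the true per-task peaks.
val perTaskPeaks = Seq(task1, task2).map(_.update.getOrElse("0").toLong)
assert(perTaskPeaks == Seq(10L, 10L))
```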
--- .../org/apache/spark/ui/jobs/StagePage.scala | 2 +- .../org/apache/spark/ui/StagePageSuite.scala | 29 ++++++++++++++----- 2 files changed, 23 insertions(+), 8 deletions(-) diff --git a/core/src/main/scala/org/apache/spark/ui/jobs/StagePage.scala b/core/src/main/scala/org/apache/spark/ui/jobs/StagePage.scala index 4adc6596ba21c..2b71f55b7bb4f 100644 --- a/core/src/main/scala/org/apache/spark/ui/jobs/StagePage.scala +++ b/core/src/main/scala/org/apache/spark/ui/jobs/StagePage.scala @@ -368,7 +368,7 @@ private[ui] class StagePage(parent: StagesTab) extends WebUIPage("stage") { val peakExecutionMemory = validTasks.map { case TaskUIData(info, _, _) => info.accumulables .find { acc => acc.name == InternalAccumulator.PEAK_EXECUTION_MEMORY } - .map { acc => acc.value.toLong } + .map { acc => acc.update.getOrElse("0").toLong } .getOrElse(0L) .toDouble } diff --git a/core/src/test/scala/org/apache/spark/ui/StagePageSuite.scala b/core/src/test/scala/org/apache/spark/ui/StagePageSuite.scala index 3388c6dca81f1..86699e7f56953 100644 --- a/core/src/test/scala/org/apache/spark/ui/StagePageSuite.scala +++ b/core/src/test/scala/org/apache/spark/ui/StagePageSuite.scala @@ -23,7 +23,7 @@ import scala.xml.Node import org.mockito.Mockito.{mock, when, RETURNS_SMART_NULLS} -import org.apache.spark.{LocalSparkContext, SparkConf, SparkFunSuite, Success} +import org.apache.spark._ import org.apache.spark.executor.TaskMetrics import org.apache.spark.scheduler._ import org.apache.spark.ui.jobs.{JobProgressListener, StagePage, StagesTab} @@ -47,6 +47,14 @@ class StagePageSuite extends SparkFunSuite with LocalSparkContext { assert(html3.contains(targetString)) } + test("SPARK-10543: peak execution memory should be per-task rather than cumulative") { + val unsafeConf = "spark.sql.unsafe.enabled" + val conf = new SparkConf(false).set(unsafeConf, "true") + val html = renderStagePage(conf).toString().toLowerCase + // verify min/25/50/75/max show task value not cumulative values + assert(html.contains("10.0 b" * 5)) + } + /** * Render a stage page started with the given conf and return the HTML. * This also runs a dummy stage to populate the page with useful content. 
@@ -67,12 +75,19 @@ class StagePageSuite extends SparkFunSuite with LocalSparkContext { // Simulate a stage in job progress listener val stageInfo = new StageInfo(0, 0, "dummy", 1, Seq.empty, Seq.empty, "details") - val taskInfo = new TaskInfo(0, 0, 0, 0, "0", "localhost", TaskLocality.ANY, false) - jobListener.onStageSubmitted(SparkListenerStageSubmitted(stageInfo)) - jobListener.onTaskStart(SparkListenerTaskStart(0, 0, taskInfo)) - taskInfo.markSuccessful() - jobListener.onTaskEnd( - SparkListenerTaskEnd(0, 0, "result", Success, taskInfo, TaskMetrics.empty)) + // Simulate two tasks to test PEAK_EXECUTION_MEMORY correctness + (1 to 2).foreach { + taskId => + val taskInfo = new TaskInfo(taskId, taskId, 0, 0, "0", "localhost", TaskLocality.ANY, false) + val peakExecutionMemory = 10 + taskInfo.accumulables += new AccumulableInfo(0, InternalAccumulator.PEAK_EXECUTION_MEMORY, + Some(peakExecutionMemory.toString), (peakExecutionMemory * taskId).toString, true) + jobListener.onStageSubmitted(SparkListenerStageSubmitted(stageInfo)) + jobListener.onTaskStart(SparkListenerTaskStart(0, 0, taskInfo)) + taskInfo.markSuccessful() + jobListener.onTaskEnd( + SparkListenerTaskEnd(0, 0, "result", Success, taskInfo, TaskMetrics.empty)) + } jobListener.onStageCompleted(SparkListenerStageCompleted(stageInfo)) page.render(request) } From 7b6c856367b9c36348e80e83959150da9656c4dd Mon Sep 17 00:00:00 2001 From: Andrew Or Date: Mon, 14 Sep 2015 15:09:43 -0700 Subject: [PATCH 283/802] [SPARK-10564] ThreadingSuite: assertion failures in threads don't fail the test (round 2) This is a follow-up patch to #8723. I missed one case there. Author: Andrew Or Closes #8727 from andrewor14/fix-threading-suite. --- .../org/apache/spark/ThreadingSuite.scala | 23 ++++++++++++------- 1 file changed, 15 insertions(+), 8 deletions(-) diff --git a/core/src/test/scala/org/apache/spark/ThreadingSuite.scala b/core/src/test/scala/org/apache/spark/ThreadingSuite.scala index cda2b245526f7..a96a4ce201c21 100644 --- a/core/src/test/scala/org/apache/spark/ThreadingSuite.scala +++ b/core/src/test/scala/org/apache/spark/ThreadingSuite.scala @@ -147,12 +147,12 @@ class ThreadingSuite extends SparkFunSuite with LocalSparkContext with Logging { }.start() } sem.acquire(2) + throwable.foreach { t => throw t } if (ThreadingSuiteState.failed.get()) { logError("Waited 1 second without seeing runningThreads = 4 (it was " + ThreadingSuiteState.runningThreads.get() + "); failing test") fail("One or more threads didn't see runningThreads = 4") } - throwable.foreach { t => throw t } } test("set local properties in different thread") { @@ -178,8 +178,8 @@ class ThreadingSuite extends SparkFunSuite with LocalSparkContext with Logging { threads.foreach(_.start()) sem.acquire(5) - assert(sc.getLocalProperty("test") === null) throwable.foreach { t => throw t } + assert(sc.getLocalProperty("test") === null) } test("set and get local properties in parent-children thread") { @@ -207,15 +207,16 @@ class ThreadingSuite extends SparkFunSuite with LocalSparkContext with Logging { threads.foreach(_.start()) sem.acquire(5) + throwable.foreach { t => throw t } assert(sc.getLocalProperty("test") === "parent") assert(sc.getLocalProperty("Foo") === null) - throwable.foreach { t => throw t } } test("mutations to local properties should not affect submitted jobs (SPARK-6629)") { val jobStarted = new Semaphore(0) val jobEnded = new Semaphore(0) @volatile var jobResult: JobResult = null + var throwable: Option[Throwable] = None sc = new SparkContext("local", "test") 
sc.setJobGroup("originalJobGroupId", "description") @@ -232,14 +233,19 @@ class ThreadingSuite extends SparkFunSuite with LocalSparkContext with Logging { // Create a new thread which will inherit the current thread's properties val thread = new Thread() { override def run(): Unit = { - assert(sc.getLocalProperty(SparkContext.SPARK_JOB_GROUP_ID) === "originalJobGroupId") - // Sleeps for a total of 10 seconds, but allows cancellation to interrupt the task try { - sc.parallelize(1 to 100).foreach { x => - Thread.sleep(100) + assert(sc.getLocalProperty(SparkContext.SPARK_JOB_GROUP_ID) === "originalJobGroupId") + // Sleeps for a total of 10 seconds, but allows cancellation to interrupt the task + try { + sc.parallelize(1 to 100).foreach { x => + Thread.sleep(100) + } + } catch { + case s: SparkException => // ignored so that we don't print noise in test logs } } catch { - case s: SparkException => // ignored so that we don't print noise in test logs + case t: Throwable => + throwable = Some(t) } } } @@ -252,6 +258,7 @@ class ThreadingSuite extends SparkFunSuite with LocalSparkContext with Logging { // modification of the properties object should not affect the properties of running jobs sc.cancelJobGroup("originalJobGroupId") jobEnded.tryAcquire(10, TimeUnit.SECONDS) + throwable.foreach { t => throw t } assert(jobResult.isInstanceOf[JobFailed]) } } From 1a0955250bb65cd6f5818ad60efb62ea4b45d18e Mon Sep 17 00:00:00 2001 From: Matei Zaharia Date: Mon, 14 Sep 2015 21:47:40 -0400 Subject: [PATCH 284/802] [SPARK-9851] Support submitting map stages individually in DAGScheduler This patch adds support for submitting map stages in a DAG individually so that we can make downstream decisions after seeing statistics about their output, as part of SPARK-9850. I also added more comments to many of the key classes in DAGScheduler. By itself, the patch is not super useful except maybe to switch between a shuffle and broadcast join, but with the other subtasks of SPARK-9850 we'll be able to do more interesting decisions. The main entry point is SparkContext.submitMapStage, which lets you run a map stage and see stats about the map output sizes. Other stats could also be collected through accumulators. See AdaptiveSchedulingSuite for a short example. Author: Matei Zaharia Closes #8180 from mateiz/spark-9851. 
--- .../apache/spark/MapOutputStatistics.scala | 27 ++ .../org/apache/spark/MapOutputTracker.scala | 49 +++- .../scala/org/apache/spark/SparkContext.scala | 17 ++ .../apache/spark/scheduler/ActiveJob.scala | 34 ++- .../apache/spark/scheduler/DAGScheduler.scala | 251 +++++++++++++++--- .../spark/scheduler/DAGSchedulerEvent.scala | 10 + .../apache/spark/scheduler/ResultStage.scala | 17 +- .../spark/scheduler/ShuffleMapStage.scala | 13 +- .../org/apache/spark/scheduler/Stage.scala | 26 +- .../scala/org/apache/spark/FailureSuite.scala | 21 ++ .../scheduler/AdaptiveSchedulingSuite.scala | 65 +++++ .../spark/scheduler/DAGSchedulerSuite.scala | 243 ++++++++++++++++- 12 files changed, 710 insertions(+), 63 deletions(-) create mode 100644 core/src/main/scala/org/apache/spark/MapOutputStatistics.scala create mode 100644 core/src/test/scala/org/apache/spark/scheduler/AdaptiveSchedulingSuite.scala diff --git a/core/src/main/scala/org/apache/spark/MapOutputStatistics.scala b/core/src/main/scala/org/apache/spark/MapOutputStatistics.scala new file mode 100644 index 0000000000000..f8a6f1d0d8cbb --- /dev/null +++ b/core/src/main/scala/org/apache/spark/MapOutputStatistics.scala @@ -0,0 +1,27 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.spark + +/** + * Holds statistics about the output sizes in a map stage. May become a DeveloperApi in the future. + * + * @param shuffleId ID of the shuffle + * @param bytesByPartitionId approximate number of output bytes for each map output partition + * (may be inexact due to use of compressed map statuses) + */ +private[spark] class MapOutputStatistics(val shuffleId: Int, val bytesByPartitionId: Array[Long]) diff --git a/core/src/main/scala/org/apache/spark/MapOutputTracker.scala b/core/src/main/scala/org/apache/spark/MapOutputTracker.scala index a387592783850..94eb8daa85c53 100644 --- a/core/src/main/scala/org/apache/spark/MapOutputTracker.scala +++ b/core/src/main/scala/org/apache/spark/MapOutputTracker.scala @@ -18,6 +18,7 @@ package org.apache.spark import java.io._ +import java.util.Arrays import java.util.concurrent.ConcurrentHashMap import java.util.zip.{GZIPInputStream, GZIPOutputStream} @@ -132,13 +133,43 @@ private[spark] abstract class MapOutputTracker(conf: SparkConf) extends Logging * describing the shuffle blocks that are stored at that block manager. 
*/ def getMapSizesByExecutorId(shuffleId: Int, reduceId: Int) - : Seq[(BlockManagerId, Seq[(BlockId, Long)])] = { + : Seq[(BlockManagerId, Seq[(BlockId, Long)])] = { logDebug(s"Fetching outputs for shuffle $shuffleId, reduce $reduceId") - val startTime = System.currentTimeMillis + val statuses = getStatuses(shuffleId) + // Synchronize on the returned array because, on the driver, it gets mutated in place + statuses.synchronized { + return MapOutputTracker.convertMapStatuses(shuffleId, reduceId, statuses) + } + } + /** + * Return statistics about all of the outputs for a given shuffle. + */ + def getStatistics(dep: ShuffleDependency[_, _, _]): MapOutputStatistics = { + val statuses = getStatuses(dep.shuffleId) + // Synchronize on the returned array because, on the driver, it gets mutated in place + statuses.synchronized { + val totalSizes = new Array[Long](dep.partitioner.numPartitions) + for (s <- statuses) { + for (i <- 0 until totalSizes.length) { + totalSizes(i) += s.getSizeForBlock(i) + } + } + new MapOutputStatistics(dep.shuffleId, totalSizes) + } + } + + /** + * Get or fetch the array of MapStatuses for a given shuffle ID. NOTE: clients MUST synchronize + * on this array when reading it, because on the driver, we may be changing it in place. + * + * (It would be nice to remove this restriction in the future.) + */ + private def getStatuses(shuffleId: Int): Array[MapStatus] = { val statuses = mapStatuses.get(shuffleId).orNull if (statuses == null) { logInfo("Don't have map outputs for shuffle " + shuffleId + ", fetching them") + val startTime = System.currentTimeMillis var fetchedStatuses: Array[MapStatus] = null fetching.synchronized { // Someone else is fetching it; wait for them to be done @@ -160,7 +191,7 @@ private[spark] abstract class MapOutputTracker(conf: SparkConf) extends Logging } if (fetchedStatuses == null) { - // We won the race to fetch the output locs; do so + // We won the race to fetch the statuses; do so logInfo("Doing the fetch; tracker endpoint = " + trackerEndpoint) // This try-finally prevents hangs due to timeouts: try { @@ -175,22 +206,18 @@ private[spark] abstract class MapOutputTracker(conf: SparkConf) extends Logging } } } - logDebug(s"Fetching map output location for shuffle $shuffleId, reduce $reduceId took " + + logDebug(s"Fetching map output statuses for shuffle $shuffleId took " + s"${System.currentTimeMillis - startTime} ms") if (fetchedStatuses != null) { - fetchedStatuses.synchronized { - return MapOutputTracker.convertMapStatuses(shuffleId, reduceId, fetchedStatuses) - } + return fetchedStatuses } else { logError("Missing all output locations for shuffle " + shuffleId) throw new MetadataFetchFailedException( - shuffleId, reduceId, "Missing all output locations for shuffle " + shuffleId) + shuffleId, -1, "Missing all output locations for shuffle " + shuffleId) } } else { - statuses.synchronized { - return MapOutputTracker.convertMapStatuses(shuffleId, reduceId, statuses) - } + return statuses } } diff --git a/core/src/main/scala/org/apache/spark/SparkContext.scala b/core/src/main/scala/org/apache/spark/SparkContext.scala index e27b3c4962221..dee6091ce3caf 100644 --- a/core/src/main/scala/org/apache/spark/SparkContext.scala +++ b/core/src/main/scala/org/apache/spark/SparkContext.scala @@ -1984,6 +1984,23 @@ class SparkContext(config: SparkConf) extends Logging with ExecutorAllocationCli new SimpleFutureAction(waiter, resultFunc) } + /** + * Submit a map stage for execution. 
This is currently an internal API only, but might be + * promoted to DeveloperApi in the future. + */ + private[spark] def submitMapStage[K, V, C](dependency: ShuffleDependency[K, V, C]) + : SimpleFutureAction[MapOutputStatistics] = { + assertNotStopped() + val callSite = getCallSite() + var result: MapOutputStatistics = null + val waiter = dagScheduler.submitMapStage( + dependency, + (r: MapOutputStatistics) => { result = r }, + callSite, + localProperties.get) + new SimpleFutureAction[MapOutputStatistics](waiter, result) + } + /** * Cancel active jobs for the specified group. See [[org.apache.spark.SparkContext.setJobGroup]] * for more information. diff --git a/core/src/main/scala/org/apache/spark/scheduler/ActiveJob.scala b/core/src/main/scala/org/apache/spark/scheduler/ActiveJob.scala index 50a69379412d2..a3d2db31301b3 100644 --- a/core/src/main/scala/org/apache/spark/scheduler/ActiveJob.scala +++ b/core/src/main/scala/org/apache/spark/scheduler/ActiveJob.scala @@ -23,18 +23,42 @@ import org.apache.spark.TaskContext import org.apache.spark.util.CallSite /** - * Tracks information about an active job in the DAGScheduler. + * A running job in the DAGScheduler. Jobs can be of two types: a result job, which computes a + * ResultStage to execute an action, or a map-stage job, which computes the map outputs for a + * ShuffleMapStage before any downstream stages are submitted. The latter is used for adaptive + * query planning, to look at map output statistics before submitting later stages. We distinguish + * between these two types of jobs using the finalStage field of this class. + * + * Jobs are only tracked for "leaf" stages that clients directly submitted, through DAGScheduler's + * submitJob or submitMapStage methods. However, either type of job may cause the execution of + * other earlier stages (for RDDs in the DAG it depends on), and multiple jobs may share some of + * these previous stages. These dependencies are managed inside DAGScheduler. + * + * @param jobId A unique ID for this job. + * @param finalStage The stage that this job computes (either a ResultStage for an action or a + * ShuffleMapStage for submitMapStage). + * @param callSite Where this job was initiated in the user's program (shown on UI). + * @param listener A listener to notify if tasks in this job finish or the job fails. + * @param properties Scheduling properties attached to the job, such as fair scheduler pool name. */ private[spark] class ActiveJob( val jobId: Int, - val finalStage: ResultStage, - val func: (TaskContext, Iterator[_]) => _, - val partitions: Array[Int], + val finalStage: Stage, val callSite: CallSite, val listener: JobListener, val properties: Properties) { - val numPartitions = partitions.length + /** + * Number of partitions we need to compute for this job. Note that result stages may not need + * to compute all partitions in their target RDD, for actions like first() and lookup(). 
+ */ + val numPartitions = finalStage match { + case r: ResultStage => r.partitions.length + case m: ShuffleMapStage => m.rdd.partitions.length + } + + /** Which partitions of the stage have finished */ val finished = Array.fill[Boolean](numPartitions)(false) + var numFinished = 0 } diff --git a/core/src/main/scala/org/apache/spark/scheduler/DAGScheduler.scala b/core/src/main/scala/org/apache/spark/scheduler/DAGScheduler.scala index 09e963f5cdf60..b4f90e8347894 100644 --- a/core/src/main/scala/org/apache/spark/scheduler/DAGScheduler.scala +++ b/core/src/main/scala/org/apache/spark/scheduler/DAGScheduler.scala @@ -45,17 +45,65 @@ import org.apache.spark.storage.BlockManagerMessages.BlockManagerHeartbeat * The high-level scheduling layer that implements stage-oriented scheduling. It computes a DAG of * stages for each job, keeps track of which RDDs and stage outputs are materialized, and finds a * minimal schedule to run the job. It then submits stages as TaskSets to an underlying - * TaskScheduler implementation that runs them on the cluster. + * TaskScheduler implementation that runs them on the cluster. A TaskSet contains fully independent + * tasks that can run right away based on the data that's already on the cluster (e.g. map output + * files from previous stages), though it may fail if this data becomes unavailable. * - * In addition to coming up with a DAG of stages, this class also determines the preferred + * Spark stages are created by breaking the RDD graph at shuffle boundaries. RDD operations with + * "narrow" dependencies, like map() and filter(), are pipelined together into one set of tasks + * in each stage, but operations with shuffle dependencies require multiple stages (one to write a + * set of map output files, and another to read those files after a barrier). In the end, every + * stage will have only shuffle dependencies on other stages, and may compute multiple operations + * inside it. The actual pipelining of these operations happens in the RDD.compute() functions of + * various RDDs (MappedRDD, FilteredRDD, etc). + * + * In addition to coming up with a DAG of stages, the DAGScheduler also determines the preferred * locations to run each task on, based on the current cache status, and passes these to the * low-level TaskScheduler. Furthermore, it handles failures due to shuffle output files being * lost, in which case old stages may need to be resubmitted. Failures *within* a stage that are * not caused by shuffle file loss are handled by the TaskScheduler, which will retry each task * a small number of times before cancelling the whole stage. * + * When looking through this code, there are several key concepts: + * + * - Jobs (represented by [[ActiveJob]]) are the top-level work items submitted to the scheduler. + * For example, when the user calls an action, like count(), a job will be submitted through + * submitJob. Each Job may require the execution of multiple stages to build intermediate data. + * + * - Stages ([[Stage]]) are sets of tasks that compute intermediate results in jobs, where each + * task computes the same function on partitions of the same RDD. Stages are separated at shuffle + * boundaries, which introduce a barrier (where we must wait for the previous stage to finish to + * fetch outputs). There are two types of stages: [[ResultStage]], for the final stage that + * executes an action, and [[ShuffleMapStage]], which writes map output files for a shuffle. + * Stages are often shared across multiple jobs, if these jobs reuse the same RDDs. 
+ * + * - Tasks are individual units of work, each sent to one machine. + * + * - Cache tracking: the DAGScheduler figures out which RDDs are cached to avoid recomputing them + * and likewise remembers which shuffle map stages have already produced output files to avoid + * redoing the map side of a shuffle. + * + * - Preferred locations: the DAGScheduler also computes where to run each task in a stage based + * on the preferred locations of its underlying RDDs, or the location of cached or shuffle data. + * + * - Cleanup: all data structures are cleared when the running jobs that depend on them finish, + * to prevent memory leaks in a long-running application. + * + * To recover from failures, the same stage might need to run multiple times, which are called + * "attempts". If the TaskScheduler reports that a task failed because a map output file from a + * previous stage was lost, the DAGScheduler resubmits that lost stage. This is detected through a + * CompletionEvent with FetchFailed, or an ExecutorLost event. The DAGScheduler will wait a small + * amount of time to see whether other nodes or tasks fail, then resubmit TaskSets for any lost + * stage(s) that compute the missing tasks. As part of this process, we might also have to create + * Stage objects for old (finished) stages where we previously cleaned up the Stage object. Since + * tasks from the old attempt of a stage could still be running, care must be taken to map any + * events received in the correct Stage object. + * * Here's a checklist to use when making or reviewing changes to this class: * + * - All data structures should be cleared when the jobs involving them end to avoid indefinite + * accumulation of state in long-running programs. + * * - When adding a new data structure, update `DAGSchedulerSuite.assertDataStructuresEmpty` to * include the new structure. This will help to catch memory leaks. */ @@ -295,12 +343,12 @@ class DAGScheduler( */ private def newResultStage( rdd: RDD[_], - numTasks: Int, + func: (TaskContext, Iterator[_]) => _, + partitions: Array[Int], jobId: Int, callSite: CallSite): ResultStage = { val (parentStages: List[Stage], id: Int) = getParentStagesAndId(rdd, jobId) - val stage: ResultStage = new ResultStage(id, rdd, numTasks, parentStages, jobId, callSite) - + val stage = new ResultStage(id, rdd, func, partitions, parentStages, jobId, callSite) stageIdToStage(id) = stage updateJobIdStageIdMaps(jobId, stage) stage @@ -500,12 +548,25 @@ class DAGScheduler( jobIdToStageIds -= job.jobId jobIdToActiveJob -= job.jobId activeJobs -= job - job.finalStage.resultOfJob = None + job.finalStage match { + case r: ResultStage => + r.resultOfJob = None + case m: ShuffleMapStage => + m.mapStageJobs = m.mapStageJobs.filter(_ != job) + } } /** - * Submit a job to the job scheduler and get a JobWaiter object back. The JobWaiter object + * Submit an action job to the scheduler and get a JobWaiter object back. The JobWaiter object * can be used to block until the the job finishes executing or can be used to cancel the job. + * + * @param rdd target RDD to run tasks on + * @param func a function to run on each partition of the RDD + * @param partitions set of partitions to run on; some jobs may not want to compute on all + * partitions of the target RDD, e.g. for operations like first() + * @param callSite where in the user program this job was called + * @param resultHandler callback to pass each result to + * @param properties scheduler properties to attach to this job, e.g. 
fair scheduler pool name */ def submitJob[T, U]( rdd: RDD[T], @@ -524,6 +585,7 @@ class DAGScheduler( val jobId = nextJobId.getAndIncrement() if (partitions.size == 0) { + // Return immediately if the job is running 0 tasks return new JobWaiter[U](this, jobId, 0, resultHandler) } @@ -536,6 +598,18 @@ class DAGScheduler( waiter } + /** + * Run an action job on the given RDD and pass all the results to the resultHandler function as + * they arrive. Throws an exception if the job fials, or returns normally if successful. + * + * @param rdd target RDD to run tasks on + * @param func a function to run on each partition of the RDD + * @param partitions set of partitions to run on; some jobs may not want to compute on all + * partitions of the target RDD, e.g. for operations like first() + * @param callSite where in the user program this job was called + * @param resultHandler callback to pass each result to + * @param properties scheduler properties to attach to this job, e.g. fair scheduler pool name + */ def runJob[T, U]( rdd: RDD[T], func: (TaskContext, Iterator[T]) => U, @@ -559,6 +633,17 @@ class DAGScheduler( } } + /** + * Run an approximate job on the given RDD and pass all the results to an ApproximateEvaluator + * as they arrive. Returns a partial result object from the evaluator. + * + * @param rdd target RDD to run tasks on + * @param func a function to run on each partition of the RDD + * @param evaluator [[ApproximateEvaluator]] to receive the partial results + * @param callSite where in the user program this job was called + * @param timeout maximum time to wait for the job, in milliseconds + * @param properties scheduler properties to attach to this job, e.g. fair scheduler pool name + */ def runApproximateJob[T, U, R]( rdd: RDD[T], func: (TaskContext, Iterator[T]) => U, @@ -575,6 +660,41 @@ class DAGScheduler( listener.awaitResult() // Will throw an exception if the job fails } + /** + * Submit a shuffle map stage to run independently and get a JobWaiter object back. The waiter + * can be used to block until the the job finishes executing or can be used to cancel the job. + * This method is used for adaptive query planning, to run map stages and look at statistics + * about their outputs before submitting downstream stages. + * + * @param dependency the ShuffleDependency to run a map stage for + * @param callback function called with the result of the job, which in this case will be a + * single MapOutputStatistics object showing how much data was produced for each partition + * @param callSite where in the user program this job was submitted + * @param properties scheduler properties to attach to this job, e.g. fair scheduler pool name + */ + def submitMapStage[K, V, C]( + dependency: ShuffleDependency[K, V, C], + callback: MapOutputStatistics => Unit, + callSite: CallSite, + properties: Properties): JobWaiter[MapOutputStatistics] = { + + val rdd = dependency.rdd + val jobId = nextJobId.getAndIncrement() + if (rdd.partitions.length == 0) { + throw new SparkException("Can't run submitMapStage on RDD with 0 partitions") + } + + // We create a JobWaiter with only one "task", which will be marked as complete when the whole + // map stage has completed, and will be passed the MapOutputStatistics for that stage. 
+ // This makes it easier to avoid race conditions between the user code and the map output + // tracker that might result if we told the user the stage had finished, but then they queries + // the map output tracker and some node failures had caused the output statistics to be lost. + val waiter = new JobWaiter(this, jobId, 1, (i: Int, r: MapOutputStatistics) => callback(r)) + eventProcessLoop.post(MapStageSubmitted( + jobId, dependency, callSite, waiter, SerializationUtils.clone(properties))) + waiter + } + /** * Cancel a job that is running or waiting in the queue. */ @@ -583,6 +703,9 @@ class DAGScheduler( eventProcessLoop.post(JobCancelled(jobId)) } + /** + * Cancel all jobs in the given job group ID. + */ def cancelJobGroup(groupId: String): Unit = { logInfo("Asked to cancel job group " + groupId) eventProcessLoop.post(JobGroupCancelled(groupId)) @@ -720,31 +843,77 @@ class DAGScheduler( try { // New stage creation may throw an exception if, for example, jobs are run on a // HadoopRDD whose underlying HDFS files have been deleted. - finalStage = newResultStage(finalRDD, partitions.length, jobId, callSite) + finalStage = newResultStage(finalRDD, func, partitions, jobId, callSite) } catch { case e: Exception => logWarning("Creating new stage failed due to exception - job: " + jobId, e) listener.jobFailed(e) return } - if (finalStage != null) { - val job = new ActiveJob(jobId, finalStage, func, partitions, callSite, listener, properties) - clearCacheLocs() - logInfo("Got job %s (%s) with %d output partitions".format( - job.jobId, callSite.shortForm, partitions.length)) - logInfo("Final stage: " + finalStage + "(" + finalStage.name + ")") - logInfo("Parents of final stage: " + finalStage.parents) - logInfo("Missing parents: " + getMissingParentStages(finalStage)) - val jobSubmissionTime = clock.getTimeMillis() - jobIdToActiveJob(jobId) = job - activeJobs += job - finalStage.resultOfJob = Some(job) - val stageIds = jobIdToStageIds(jobId).toArray - val stageInfos = stageIds.flatMap(id => stageIdToStage.get(id).map(_.latestInfo)) - listenerBus.post( - SparkListenerJobStart(job.jobId, jobSubmissionTime, stageInfos, properties)) - submitStage(finalStage) + + val job = new ActiveJob(jobId, finalStage, callSite, listener, properties) + clearCacheLocs() + logInfo("Got job %s (%s) with %d output partitions".format( + job.jobId, callSite.shortForm, partitions.length)) + logInfo("Final stage: " + finalStage + " (" + finalStage.name + ")") + logInfo("Parents of final stage: " + finalStage.parents) + logInfo("Missing parents: " + getMissingParentStages(finalStage)) + + val jobSubmissionTime = clock.getTimeMillis() + jobIdToActiveJob(jobId) = job + activeJobs += job + finalStage.resultOfJob = Some(job) + val stageIds = jobIdToStageIds(jobId).toArray + val stageInfos = stageIds.flatMap(id => stageIdToStage.get(id).map(_.latestInfo)) + listenerBus.post( + SparkListenerJobStart(job.jobId, jobSubmissionTime, stageInfos, properties)) + submitStage(finalStage) + + submitWaitingStages() + } + + private[scheduler] def handleMapStageSubmitted(jobId: Int, + dependency: ShuffleDependency[_, _, _], + callSite: CallSite, + listener: JobListener, + properties: Properties) { + // Submitting this map stage might still require the creation of some parent stages, so make + // sure that happens. + var finalStage: ShuffleMapStage = null + try { + // New stage creation may throw an exception if, for example, jobs are run on a + // HadoopRDD whose underlying HDFS files have been deleted. 
+ finalStage = getShuffleMapStage(dependency, jobId) + } catch { + case e: Exception => + logWarning("Creating new stage failed due to exception - job: " + jobId, e) + listener.jobFailed(e) + return + } + + val job = new ActiveJob(jobId, finalStage, callSite, listener, properties) + clearCacheLocs() + logInfo("Got map stage job %s (%s) with %d output partitions".format( + jobId, callSite.shortForm, dependency.rdd.partitions.size)) + logInfo("Final stage: " + finalStage + " (" + finalStage.name + ")") + logInfo("Parents of final stage: " + finalStage.parents) + logInfo("Missing parents: " + getMissingParentStages(finalStage)) + + val jobSubmissionTime = clock.getTimeMillis() + jobIdToActiveJob(jobId) = job + activeJobs += job + finalStage.mapStageJobs = job :: finalStage.mapStageJobs + val stageIds = jobIdToStageIds(jobId).toArray + val stageInfos = stageIds.flatMap(id => stageIdToStage.get(id).map(_.latestInfo)) + listenerBus.post( + SparkListenerJobStart(job.jobId, jobSubmissionTime, stageInfos, properties)) + submitStage(finalStage) + + // If the whole stage has already finished, tell the listener and remove it + if (!finalStage.outputLocs.contains(Nil)) { + markMapStageJobAsFinished(job, mapOutputTracker.getStatistics(dependency)) } + submitWaitingStages() } @@ -814,7 +983,7 @@ class DAGScheduler( case s: ResultStage => val job = s.resultOfJob.get partitionsToCompute.map { id => - val p = job.partitions(id) + val p = s.partitions(id) (id, getPreferredLocs(stage.rdd, p)) }.toMap } @@ -844,7 +1013,7 @@ class DAGScheduler( case stage: ShuffleMapStage => closureSerializer.serialize((stage.rdd, stage.shuffleDep): AnyRef).array() case stage: ResultStage => - closureSerializer.serialize((stage.rdd, stage.resultOfJob.get.func): AnyRef).array() + closureSerializer.serialize((stage.rdd, stage.func): AnyRef).array() } taskBinary = sc.broadcast(taskBinaryBytes) @@ -875,7 +1044,7 @@ class DAGScheduler( case stage: ResultStage => val job = stage.resultOfJob.get partitionsToCompute.map { id => - val p: Int = job.partitions(id) + val p: Int = stage.partitions(id) val part = stage.rdd.partitions(p) val locs = taskIdToLocations(id) new ResultTask(stage.id, stage.latestInfo.attemptId, @@ -1052,13 +1221,21 @@ class DAGScheduler( logInfo("Resubmitting " + shuffleStage + " (" + shuffleStage.name + ") because some of its tasks had failed: " + shuffleStage.outputLocs.zipWithIndex.filter(_._1.isEmpty) - .map(_._2).mkString(", ")) + .map(_._2).mkString(", ")) submitStage(shuffleStage) + } else { + // Mark any map-stage jobs waiting on this stage as finished + if (shuffleStage.mapStageJobs.nonEmpty) { + val stats = mapOutputTracker.getStatistics(shuffleStage.shuffleDep) + for (job <- shuffleStage.mapStageJobs) { + markMapStageJobAsFinished(job, stats) + } + } } // Note: newly runnable stages will be submitted below when we submit waiting stages } - } + } case Resubmitted => logInfo("Resubmitted " + task + ", so marking it as still running") @@ -1412,6 +1589,17 @@ class DAGScheduler( Nil } + /** Mark a map stage job as finished with the given output stats, and report to its listener. 
*/ + def markMapStageJobAsFinished(job: ActiveJob, stats: MapOutputStatistics): Unit = { + // In map stage jobs, we only create a single "task", which is to finish all of the stage + // (including reusing any previous map outputs, etc); so we just mark task 0 as done + job.finished(0) = true + job.numFinished += 1 + job.listener.taskSucceeded(0, stats) + cleanupStateForJobAndIndependentStages(job) + listenerBus.post(SparkListenerJobEnd(job.jobId, clock.getTimeMillis(), JobSucceeded)) + } + def stop() { logInfo("Stopping DAGScheduler") messageScheduler.shutdownNow() @@ -1445,6 +1633,9 @@ private[scheduler] class DAGSchedulerEventProcessLoop(dagScheduler: DAGScheduler case JobSubmitted(jobId, rdd, func, partitions, callSite, listener, properties) => dagScheduler.handleJobSubmitted(jobId, rdd, func, partitions, callSite, listener, properties) + case MapStageSubmitted(jobId, dependency, callSite, listener, properties) => + dagScheduler.handleMapStageSubmitted(jobId, dependency, callSite, listener, properties) + case StageCancelled(stageId) => dagScheduler.handleStageCancellation(stageId) diff --git a/core/src/main/scala/org/apache/spark/scheduler/DAGSchedulerEvent.scala b/core/src/main/scala/org/apache/spark/scheduler/DAGSchedulerEvent.scala index f72a52e85dc15..dda3b6cc7f960 100644 --- a/core/src/main/scala/org/apache/spark/scheduler/DAGSchedulerEvent.scala +++ b/core/src/main/scala/org/apache/spark/scheduler/DAGSchedulerEvent.scala @@ -35,6 +35,7 @@ import org.apache.spark.util.CallSite */ private[scheduler] sealed trait DAGSchedulerEvent +/** A result-yielding job was submitted on a target RDD */ private[scheduler] case class JobSubmitted( jobId: Int, finalRDD: RDD[_], @@ -45,6 +46,15 @@ private[scheduler] case class JobSubmitted( properties: Properties = null) extends DAGSchedulerEvent +/** A map stage as submitted to run as a separate job */ +private[scheduler] case class MapStageSubmitted( + jobId: Int, + dependency: ShuffleDependency[_, _, _], + callSite: CallSite, + listener: JobListener, + properties: Properties = null) + extends DAGSchedulerEvent + private[scheduler] case class StageCancelled(stageId: Int) extends DAGSchedulerEvent private[scheduler] case class JobCancelled(jobId: Int) extends DAGSchedulerEvent diff --git a/core/src/main/scala/org/apache/spark/scheduler/ResultStage.scala b/core/src/main/scala/org/apache/spark/scheduler/ResultStage.scala index bf81b9aca4810..c0451da1f0247 100644 --- a/core/src/main/scala/org/apache/spark/scheduler/ResultStage.scala +++ b/core/src/main/scala/org/apache/spark/scheduler/ResultStage.scala @@ -17,23 +17,30 @@ package org.apache.spark.scheduler +import org.apache.spark.TaskContext import org.apache.spark.rdd.RDD import org.apache.spark.util.CallSite /** - * The ResultStage represents the final stage in a job. + * ResultStages apply a function on some partitions of an RDD to compute the result of an action. + * The ResultStage object captures the function to execute, `func`, which will be applied to each + * partition, and the set of partition IDs, `partitions`. Some stages may not run on all partitions + * of the RDD, for actions like first() and lookup(). 
*/ private[spark] class ResultStage( id: Int, rdd: RDD[_], - numTasks: Int, + val func: (TaskContext, Iterator[_]) => _, + val partitions: Array[Int], parents: List[Stage], firstJobId: Int, callSite: CallSite) - extends Stage(id, rdd, numTasks, parents, firstJobId, callSite) { + extends Stage(id, rdd, partitions.length, parents, firstJobId, callSite) { - // The active job for this result stage. Will be empty if the job has already finished - // (e.g., because the job was cancelled). + /** + * The active job for this result stage. Will be empty if the job has already finished + * (e.g., because the job was cancelled). + */ var resultOfJob: Option[ActiveJob] = None override def toString: String = "ResultStage " + id diff --git a/core/src/main/scala/org/apache/spark/scheduler/ShuffleMapStage.scala b/core/src/main/scala/org/apache/spark/scheduler/ShuffleMapStage.scala index 48d8d8e9c4b78..7d92960876403 100644 --- a/core/src/main/scala/org/apache/spark/scheduler/ShuffleMapStage.scala +++ b/core/src/main/scala/org/apache/spark/scheduler/ShuffleMapStage.scala @@ -23,7 +23,15 @@ import org.apache.spark.storage.BlockManagerId import org.apache.spark.util.CallSite /** - * The ShuffleMapStage represents the intermediate stages in a job. + * ShuffleMapStages are intermediate stages in the execution DAG that produce data for a shuffle. + * They occur right before each shuffle operation, and might contain multiple pipelined operations + * before that (e.g. map and filter). When executed, they save map output files that can later be + * fetched by reduce tasks. The `shuffleDep` field describes the shuffle each stage is part of, + * and variables like `outputLocs` and `numAvailableOutputs` track how many map outputs are ready. + * + * ShuffleMapStages can also be submitted independently as jobs with DAGScheduler.submitMapStage. + * For such stages, the ActiveJobs that submitted them are tracked in `mapStageJobs`. Note that + * there can be multiple ActiveJobs trying to compute the same shuffle map stage. */ private[spark] class ShuffleMapStage( id: Int, @@ -37,6 +45,9 @@ private[spark] class ShuffleMapStage( override def toString: String = "ShuffleMapStage " + id + /** Running map-stage jobs that were submitted to execute this stage independently (if any) */ + var mapStageJobs: List[ActiveJob] = Nil + var numAvailableOutputs: Int = 0 def isAvailable: Boolean = numAvailableOutputs == numPartitions diff --git a/core/src/main/scala/org/apache/spark/scheduler/Stage.scala b/core/src/main/scala/org/apache/spark/scheduler/Stage.scala index c086535782c23..b37eccbd0f7b8 100644 --- a/core/src/main/scala/org/apache/spark/scheduler/Stage.scala +++ b/core/src/main/scala/org/apache/spark/scheduler/Stage.scala @@ -24,27 +24,33 @@ import org.apache.spark.rdd.RDD import org.apache.spark.util.CallSite /** - * A stage is a set of independent tasks all computing the same function that need to run as part + * A stage is a set of parallel tasks all computing the same function that need to run as part * of a Spark job, where all the tasks have the same shuffle dependencies. Each DAG of tasks run * by the scheduler is split up into stages at the boundaries where shuffle occurs, and then the * DAGScheduler runs these stages in topological order. * * Each Stage can either be a shuffle map stage, in which case its tasks' results are input for - * another stage, or a result stage, in which case its tasks directly compute the action that - * initiated a job (e.g. count(), save(), etc). 
For shuffle map stages, we also track the nodes - * that each output partition is on. + * other stage(s), or a result stage, in which case its tasks directly compute a Spark action + * (e.g. count(), save(), etc) by running a function on an RDD. For shuffle map stages, we also + * track the nodes that each output partition is on. * * Each Stage also has a firstJobId, identifying the job that first submitted the stage. When FIFO * scheduling is used, this allows Stages from earlier jobs to be computed first or recovered * faster on failure. * - * The callSite provides a location in user code which relates to the stage. For a shuffle map - * stage, the callSite gives the user code that created the RDD being shuffled. For a result - * stage, the callSite gives the user code that executes the associated action (e.g. count()). - * - * A single stage can consist of multiple attempts. In that case, the latestInfo field will - * be updated for each attempt. + * Finally, a single stage can be re-executed in multiple attempts due to fault recovery. In that + * case, the Stage object will track multiple StageInfo objects to pass to listeners or the web UI. + * The latest one will be accessible through latestInfo. * + * @param id Unique stage ID + * @param rdd RDD that this stage runs on: for a shuffle map stage, it's the RDD we run map tasks + * on, while for a result stage, it's the target RDD that we ran an action on + * @param numTasks Total number of tasks in stage; result stages in particular may not need to + * compute all partitions, e.g. for first(), lookup(), and take(). + * @param parents List of stages that this stage depends on (through shuffle dependencies). + * @param firstJobId ID of the first job this stage was part of, for FIFO scheduling. + * @param callSite Location in the user program associated with this stage: either where the target + * RDD was created, for a shuffle map stage, or where the action for a result stage was called. */ private[scheduler] abstract class Stage( val id: Int, diff --git a/core/src/test/scala/org/apache/spark/FailureSuite.scala b/core/src/test/scala/org/apache/spark/FailureSuite.scala index aa50a49c50232..f58756e6f6179 100644 --- a/core/src/test/scala/org/apache/spark/FailureSuite.scala +++ b/core/src/test/scala/org/apache/spark/FailureSuite.scala @@ -217,6 +217,27 @@ class FailureSuite extends SparkFunSuite with LocalSparkContext { FailureSuiteState.clear() } + // Run a 3-task map stage where one task fails once. + test("failure in tasks in a submitMapStage") { + sc = new SparkContext("local[1,2]", "test") + val rdd = sc.makeRDD(1 to 3, 3).map { x => + FailureSuiteState.synchronized { + FailureSuiteState.tasksRun += 1 + if (x == 1 && FailureSuiteState.tasksFailed == 0) { + FailureSuiteState.tasksFailed += 1 + throw new Exception("Intentional task failure") + } + } + (x, x) + } + val dep = new ShuffleDependency[Int, Int, Int](rdd, new HashPartitioner(2)) + sc.submitMapStage(dep).get() + FailureSuiteState.synchronized { + assert(FailureSuiteState.tasksRun === 4) + } + FailureSuiteState.clear() + } + // TODO: Need to add tests with shuffle fetch failures. 
} diff --git a/core/src/test/scala/org/apache/spark/scheduler/AdaptiveSchedulingSuite.scala b/core/src/test/scala/org/apache/spark/scheduler/AdaptiveSchedulingSuite.scala new file mode 100644 index 0000000000000..3fe28027c3c21 --- /dev/null +++ b/core/src/test/scala/org/apache/spark/scheduler/AdaptiveSchedulingSuite.scala @@ -0,0 +1,65 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.spark.scheduler + +import org.apache.spark.rdd.{ShuffledRDDPartition, RDD, ShuffledRDD} +import org.apache.spark._ + +object AdaptiveSchedulingSuiteState { + var tasksRun = 0 + + def clear(): Unit = { + tasksRun = 0 + } +} + +/** A special ShuffledRDD where we can pass a ShuffleDependency object to use */ +class CustomShuffledRDD[K, V, C](@transient dep: ShuffleDependency[K, V, C]) + extends RDD[(K, C)](dep.rdd.context, Seq(dep)) { + + override def compute(split: Partition, context: TaskContext): Iterator[(K, C)] = { + val dep = dependencies.head.asInstanceOf[ShuffleDependency[K, V, C]] + SparkEnv.get.shuffleManager.getReader(dep.shuffleHandle, split.index, split.index + 1, context) + .read() + .asInstanceOf[Iterator[(K, C)]] + } + + override def getPartitions: Array[Partition] = { + Array.tabulate[Partition](dep.partitioner.numPartitions)(i => new ShuffledRDDPartition(i)) + } +} + +class AdaptiveSchedulingSuite extends SparkFunSuite with LocalSparkContext { + test("simple use of submitMapStage") { + try { + sc = new SparkContext("local[1,2]", "test") + val rdd = sc.parallelize(1 to 3, 3).map { x => + AdaptiveSchedulingSuiteState.tasksRun += 1 + (x, x) + } + val dep = new ShuffleDependency[Int, Int, Int](rdd, new HashPartitioner(2)) + val shuffled = new CustomShuffledRDD[Int, Int, Int](dep) + sc.submitMapStage(dep).get() + assert(AdaptiveSchedulingSuiteState.tasksRun == 3) + assert(shuffled.collect().toSet == Set((1, 1), (2, 2), (3, 3))) + assert(AdaptiveSchedulingSuiteState.tasksRun == 3) + } finally { + AdaptiveSchedulingSuiteState.clear() + } + } +} diff --git a/core/src/test/scala/org/apache/spark/scheduler/DAGSchedulerSuite.scala b/core/src/test/scala/org/apache/spark/scheduler/DAGSchedulerSuite.scala index 1b9ff740ff530..1c55f90ad9b44 100644 --- a/core/src/test/scala/org/apache/spark/scheduler/DAGSchedulerSuite.scala +++ b/core/src/test/scala/org/apache/spark/scheduler/DAGSchedulerSuite.scala @@ -152,6 +152,14 @@ class DAGSchedulerSuite override def jobFailed(exception: Exception) = { failure = exception } } + /** A simple helper class for creating custom JobListeners */ + class SimpleListener extends JobListener { + val results = new HashMap[Int, Any] + var failure: Exception = null + override def taskSucceeded(index: Int, result: Any): Unit = results.put(index, result) + override def jobFailed(exception: Exception): Unit = { 
failure = exception } + } + before { sc = new SparkContext("local", "DAGSchedulerSuite") sparkListener.submittedStageInfos.clear() @@ -229,7 +237,7 @@ class DAGSchedulerSuite } } - /** Sends the rdd to the scheduler for scheduling and returns the job id. */ + /** Submits a job to the scheduler and returns the job id. */ private def submit( rdd: RDD[_], partitions: Array[Int], @@ -240,6 +248,15 @@ class DAGSchedulerSuite jobId } + /** Submits a map stage to the scheduler and returns the job id. */ + private def submitMapStage( + shuffleDep: ShuffleDependency[_, _, _], + listener: JobListener = jobListener): Int = { + val jobId = scheduler.nextJobId.getAndIncrement() + runEvent(MapStageSubmitted(jobId, shuffleDep, CallSite("", ""), listener)) + jobId + } + /** Sends TaskSetFailed to the scheduler. */ private def failed(taskSet: TaskSet, message: String) { runEvent(TaskSetFailed(taskSet, message, None)) @@ -1313,6 +1330,230 @@ class DAGSchedulerSuite assert(stackTraceString.contains("org.scalatest.FunSuite")) } + test("simple map stage submission") { + val shuffleMapRdd = new MyRDD(sc, 2, Nil) + val shuffleDep = new ShuffleDependency(shuffleMapRdd, new HashPartitioner(1)) + val reduceRdd = new MyRDD(sc, 1, List(shuffleDep)) + + // Submit a map stage by itself + submitMapStage(shuffleDep) + assert(results.size === 0) // No results yet + completeShuffleMapStageSuccessfully(0, 0, 1) + assert(results.size === 1) + results.clear() + assertDataStructuresEmpty() + + // Submit a reduce job that depends on this map stage; it should directly do the reduce + submit(reduceRdd, Array(0)) + completeNextResultStageWithSuccess(2, 0) + assert(results === Map(0 -> 42)) + results.clear() + assertDataStructuresEmpty() + + // Check that if we submit the map stage again, no tasks run + submitMapStage(shuffleDep) + assert(results.size === 1) + assertDataStructuresEmpty() + } + + test("map stage submission with reduce stage also depending on the data") { + val shuffleMapRdd = new MyRDD(sc, 2, Nil) + val shuffleDep = new ShuffleDependency(shuffleMapRdd, new HashPartitioner(1)) + val reduceRdd = new MyRDD(sc, 1, List(shuffleDep)) + + // Submit the map stage by itself + submitMapStage(shuffleDep) + + // Submit a reduce job that depends on this map stage + submit(reduceRdd, Array(0)) + + // Complete tasks for the map stage + completeShuffleMapStageSuccessfully(0, 0, 1) + assert(results.size === 1) + results.clear() + + // Complete tasks for the reduce stage + completeNextResultStageWithSuccess(1, 0) + assert(results === Map(0 -> 42)) + results.clear() + assertDataStructuresEmpty() + + // Check that if we submit the map stage again, no tasks run + submitMapStage(shuffleDep) + assert(results.size === 1) + assertDataStructuresEmpty() + } + + test("map stage submission with fetch failure") { + val shuffleMapRdd = new MyRDD(sc, 2, Nil) + val shuffleDep = new ShuffleDependency(shuffleMapRdd, new HashPartitioner(2)) + val shuffleId = shuffleDep.shuffleId + val reduceRdd = new MyRDD(sc, 2, List(shuffleDep)) + + // Submit a map stage by itself + submitMapStage(shuffleDep) + complete(taskSets(0), Seq( + (Success, makeMapStatus("hostA", reduceRdd.partitions.size)), + (Success, makeMapStatus("hostB", reduceRdd.partitions.size)))) + assert(results.size === 1) + results.clear() + assertDataStructuresEmpty() + + // Submit a reduce job that depends on this map stage, but where one reduce will fail a fetch + submit(reduceRdd, Array(0, 1)) + complete(taskSets(1), Seq( + (Success, 42), + (FetchFailed(makeBlockManagerId("hostA"), 
shuffleId, 0, 0, "ignored"), null))) + // Ask the scheduler to try it again; TaskSet 2 will rerun the map task that we couldn't fetch + // from, then TaskSet 3 will run the reduce stage + scheduler.resubmitFailedStages() + complete(taskSets(2), Seq((Success, makeMapStatus("hostA", reduceRdd.partitions.size)))) + complete(taskSets(3), Seq((Success, 43))) + assert(results === Map(0 -> 42, 1 -> 43)) + results.clear() + assertDataStructuresEmpty() + + // Run another reduce job without a failure; this should just work + submit(reduceRdd, Array(0, 1)) + complete(taskSets(4), Seq( + (Success, 44), + (Success, 45))) + assert(results === Map(0 -> 44, 1 -> 45)) + results.clear() + assertDataStructuresEmpty() + + // Resubmit the map stage; this should also just work + submitMapStage(shuffleDep) + assert(results.size === 1) + results.clear() + assertDataStructuresEmpty() + } + + /** + * In this test, we have three RDDs with shuffle dependencies, and we submit map stage jobs + * that are waiting on each one, as well as a reduce job on the last one. We test that all of + * these jobs complete even if there are some fetch failures in both shuffles. + */ + test("map stage submission with multiple shared stages and failures") { + val rdd1 = new MyRDD(sc, 2, Nil) + val dep1 = new ShuffleDependency(rdd1, new HashPartitioner(2)) + val rdd2 = new MyRDD(sc, 2, List(dep1)) + val dep2 = new ShuffleDependency(rdd2, new HashPartitioner(2)) + val rdd3 = new MyRDD(sc, 2, List(dep2)) + + val listener1 = new SimpleListener + val listener2 = new SimpleListener + val listener3 = new SimpleListener + + submitMapStage(dep1, listener1) + submitMapStage(dep2, listener2) + submit(rdd3, Array(0, 1), listener = listener3) + + // Complete the first stage + assert(taskSets(0).stageId === 0) + complete(taskSets(0), Seq( + (Success, makeMapStatus("hostA", rdd1.partitions.size)), + (Success, makeMapStatus("hostB", rdd1.partitions.size)))) + assert(mapOutputTracker.getMapSizesByExecutorId(dep1.shuffleId, 0).map(_._1).toSet === + HashSet(makeBlockManagerId("hostA"), makeBlockManagerId("hostB"))) + assert(listener1.results.size === 1) + + // When attempting the second stage, show a fetch failure + assert(taskSets(1).stageId === 1) + complete(taskSets(1), Seq( + (Success, makeMapStatus("hostA", rdd2.partitions.size)), + (FetchFailed(makeBlockManagerId("hostA"), dep1.shuffleId, 0, 0, "ignored"), null))) + scheduler.resubmitFailedStages() + assert(listener2.results.size === 0) // Second stage listener should not have a result yet + + // Stage 0 should now be running as task set 2; make its task succeed + assert(taskSets(2).stageId === 0) + complete(taskSets(2), Seq( + (Success, makeMapStatus("hostC", rdd2.partitions.size)))) + assert(mapOutputTracker.getMapSizesByExecutorId(dep1.shuffleId, 0).map(_._1).toSet === + HashSet(makeBlockManagerId("hostC"), makeBlockManagerId("hostB"))) + assert(listener2.results.size === 0) // Second stage listener should still not have a result + + // Stage 1 should now be running as task set 3; make its first task succeed + assert(taskSets(3).stageId === 1) + complete(taskSets(3), Seq( + (Success, makeMapStatus("hostB", rdd2.partitions.size)), + (Success, makeMapStatus("hostD", rdd2.partitions.size)))) + assert(mapOutputTracker.getMapSizesByExecutorId(dep2.shuffleId, 0).map(_._1).toSet === + HashSet(makeBlockManagerId("hostB"), makeBlockManagerId("hostD"))) + assert(listener2.results.size === 1) + + // Finally, the reduce job should be running as task set 4; make it see a fetch failure, + // then make it run 
again and succeed + assert(taskSets(4).stageId === 2) + complete(taskSets(4), Seq( + (Success, 52), + (FetchFailed(makeBlockManagerId("hostD"), dep2.shuffleId, 0, 0, "ignored"), null))) + scheduler.resubmitFailedStages() + + // TaskSet 5 will rerun stage 1's lost task, then TaskSet 6 will rerun stage 2 + assert(taskSets(5).stageId === 1) + complete(taskSets(5), Seq( + (Success, makeMapStatus("hostE", rdd2.partitions.size)))) + complete(taskSets(6), Seq( + (Success, 53))) + assert(listener3.results === Map(0 -> 52, 1 -> 53)) + assertDataStructuresEmpty() + } + + /** + * In this test, we run a map stage where one of the executors fails but we still receive a + * "zombie" complete message from that executor. We want to make sure the stage is not reported + * as done until all tasks have completed. + */ + test("map stage submission with executor failure late map task completions") { + val shuffleMapRdd = new MyRDD(sc, 3, Nil) + val shuffleDep = new ShuffleDependency(shuffleMapRdd, new HashPartitioner(2)) + + submitMapStage(shuffleDep) + + val oldTaskSet = taskSets(0) + runEvent(CompletionEvent(oldTaskSet.tasks(0), Success, makeMapStatus("hostA", 2), + null, createFakeTaskInfo(), null)) + assert(results.size === 0) // Map stage job should not be complete yet + + // Pretend host A was lost + val oldEpoch = mapOutputTracker.getEpoch + runEvent(ExecutorLost("exec-hostA")) + val newEpoch = mapOutputTracker.getEpoch + assert(newEpoch > oldEpoch) + + // Suppose we also get a completed event from task 1 on the same host; this should be ignored + runEvent(CompletionEvent(oldTaskSet.tasks(1), Success, makeMapStatus("hostA", 2), + null, createFakeTaskInfo(), null)) + assert(results.size === 0) // Map stage job should not be complete yet + + // A completion from another task should work because it's a non-failed host + runEvent(CompletionEvent(oldTaskSet.tasks(2), Success, makeMapStatus("hostB", 2), + null, createFakeTaskInfo(), null)) + assert(results.size === 0) // Map stage job should not be complete yet + + // Now complete tasks in the second task set + val newTaskSet = taskSets(1) + assert(newTaskSet.tasks.size === 2) // Both tasks 0 and 1 were on on hostA + runEvent(CompletionEvent(newTaskSet.tasks(0), Success, makeMapStatus("hostB", 2), + null, createFakeTaskInfo(), null)) + assert(results.size === 0) // Map stage job should not be complete yet + runEvent(CompletionEvent(newTaskSet.tasks(1), Success, makeMapStatus("hostB", 2), + null, createFakeTaskInfo(), null)) + assert(results.size === 1) // Map stage job should now finally be complete + assertDataStructuresEmpty() + + // Also test that a reduce stage using this shuffled data can immediately run + val reduceRDD = new MyRDD(sc, 2, List(shuffleDep)) + results.clear() + submit(reduceRDD, Array(0, 1)) + complete(taskSets(2), Seq((Success, 42), (Success, 43))) + assert(results === Map(0 -> 42, 1 -> 43)) + results.clear() + assertDataStructuresEmpty() + } + /** * Assert that the supplied TaskSet has exactly the given hosts as its preferred locations. * Note that this checks only the host and not the executor ID. From 55204181004c105c7a3e8c31a099b37e48bfd953 Mon Sep 17 00:00:00 2001 From: Davies Liu Date: Mon, 14 Sep 2015 19:46:34 -0700 Subject: [PATCH 285/802] [SPARK-10542] [PYSPARK] fix serialize namedtuple Author: Davies Liu Closes #8707 from davies/fix_namedtuple. 
--- python/pyspark/cloudpickle.py | 15 ++++++++++++++- python/pyspark/serializers.py | 1 + python/pyspark/tests.py | 5 +++++ 3 files changed, 20 insertions(+), 1 deletion(-) diff --git a/python/pyspark/cloudpickle.py b/python/pyspark/cloudpickle.py index 3b647985801b7..95b3abc74244b 100644 --- a/python/pyspark/cloudpickle.py +++ b/python/pyspark/cloudpickle.py @@ -350,6 +350,11 @@ def save_global(self, obj, name=None, pack=struct.pack): if new_override: d['__new__'] = obj.__new__ + # workaround for namedtuple (hijacked by PySpark) + if getattr(obj, '_is_namedtuple_', False): + self.save_reduce(_load_namedtuple, (obj.__name__, obj._fields)) + return + self.save(_load_class) self.save_reduce(typ, (obj.__name__, obj.__bases__, {"__doc__": obj.__doc__}), obj=obj) d.pop('__doc__', None) @@ -382,7 +387,7 @@ def save_instancemethod(self, obj): self.save_reduce(types.MethodType, (obj.__func__, obj.__self__), obj=obj) else: self.save_reduce(types.MethodType, (obj.__func__, obj.__self__, obj.__self__.__class__), - obj=obj) + obj=obj) dispatch[types.MethodType] = save_instancemethod def save_inst(self, obj): @@ -744,6 +749,14 @@ def _load_class(cls, d): return cls +def _load_namedtuple(name, fields): + """ + Loads a class generated by namedtuple + """ + from collections import namedtuple + return namedtuple(name, fields) + + """Constructors for 3rd party libraries Note: These can never be renamed due to client compatibility issues""" diff --git a/python/pyspark/serializers.py b/python/pyspark/serializers.py index 411b4dbf481f1..2a1326947f4f5 100644 --- a/python/pyspark/serializers.py +++ b/python/pyspark/serializers.py @@ -359,6 +359,7 @@ def _hack_namedtuple(cls): def __reduce__(self): return (_restore, (name, fields, tuple(self))) cls.__reduce__ = __reduce__ + cls._is_namedtuple_ = True return cls diff --git a/python/pyspark/tests.py b/python/pyspark/tests.py index 8bfed074c9052..647504c32f156 100644 --- a/python/pyspark/tests.py +++ b/python/pyspark/tests.py @@ -218,6 +218,11 @@ def test_namedtuple(self): p2 = loads(dumps(p1, 2)) self.assertEqual(p1, p2) + from pyspark.cloudpickle import dumps + P2 = loads(dumps(P)) + p3 = P2(1, 3) + self.assertEqual(p1, p3) + def test_itemgetter(self): from operator import itemgetter ser = CloudPickleSerializer() From 4ae4d54794778042b2cc983e52757edac02412ab Mon Sep 17 00:00:00 2001 From: Yanbo Liang Date: Mon, 14 Sep 2015 21:37:43 -0700 Subject: [PATCH 286/802] [SPARK-9793] [MLLIB] [PYSPARK] PySpark DenseVector, SparseVector implement __eq__ and __hash__ correctly PySpark DenseVector, SparseVector ```__eq__``` method should use semantics equality, and DenseVector can compared with SparseVector. Implement PySpark DenseVector, SparseVector ```__hash__``` method based on the first 16 entries. That will make PySpark Vector objects can be used in collections. Author: Yanbo Liang Closes #8166 from yanboliang/spark-9793. 
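To make the new comparison and hashing semantics concrete, a short sketch mirroring the test_eq and test_hash cases added in this patch; it assumes a PySpark build that already includes this change.

    from pyspark.mllib.linalg import DenseVector, SparseVector

    dense = DenseVector([0.0, 1.0, 0.0, 5.5])
    sparse = SparseVector(4, [(1, 1.0), (3, 5.5)])

    # Equality is now semantic and works across the two types:
    # same size and same entries means equal.
    assert dense == sparse

    # Equal vectors hash alike, so they can be used together in dicts and sets.
    assert hash(dense) == hash(sparse)
    assert len({dense, sparse}) == 1

    # A different size (or different values) means not equal.
    assert dense != SparseVector(6, [(1, 1.0), (3, 5.5)])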
--- python/pyspark/mllib/linalg/__init__.py | 90 ++++++++++++++++++++----- python/pyspark/mllib/tests.py | 32 +++++++++ 2 files changed, 107 insertions(+), 15 deletions(-) diff --git a/python/pyspark/mllib/linalg/__init__.py b/python/pyspark/mllib/linalg/__init__.py index 334dc8e38bb8f..380f86e9b44f8 100644 --- a/python/pyspark/mllib/linalg/__init__.py +++ b/python/pyspark/mllib/linalg/__init__.py @@ -25,6 +25,7 @@ import sys import array +import struct if sys.version >= '3': basestring = str @@ -122,6 +123,13 @@ def _format_float_list(l): return [_format_float(x) for x in l] +def _double_to_long_bits(value): + if np.isnan(value): + value = float('nan') + # pack double into 64 bits, then unpack as long int + return struct.unpack('Q', struct.pack('d', value))[0] + + class VectorUDT(UserDefinedType): """ SQL user-defined type (UDT) for Vector. @@ -404,11 +412,31 @@ def __repr__(self): return "DenseVector([%s])" % (', '.join(_format_float(i) for i in self.array)) def __eq__(self, other): - return isinstance(other, DenseVector) and np.array_equal(self.array, other.array) + if isinstance(other, DenseVector): + return np.array_equal(self.array, other.array) + elif isinstance(other, SparseVector): + if len(self) != other.size: + return False + return Vectors._equals(list(xrange(len(self))), self.array, other.indices, other.values) + return False def __ne__(self, other): return not self == other + def __hash__(self): + size = len(self) + result = 31 + size + nnz = 0 + i = 0 + while i < size and nnz < 128: + if self.array[i] != 0: + result = 31 * result + i + bits = _double_to_long_bits(self.array[i]) + result = 31 * result + (bits ^ (bits >> 32)) + nnz += 1 + i += 1 + return result + def __getattr__(self, item): return getattr(self.array, item) @@ -704,20 +732,14 @@ def __repr__(self): return "SparseVector({0}, {{{1}}})".format(self.size, entries) def __eq__(self, other): - """ - Test SparseVectors for equality. - - >>> v1 = SparseVector(4, [(1, 1.0), (3, 5.5)]) - >>> v2 = SparseVector(4, [(1, 1.0), (3, 5.5)]) - >>> v1 == v2 - True - >>> v1 != v2 - False - """ - return (isinstance(other, self.__class__) - and other.size == self.size - and np.array_equal(other.indices, self.indices) - and np.array_equal(other.values, self.values)) + if isinstance(other, SparseVector): + return other.size == self.size and np.array_equal(other.indices, self.indices) \ + and np.array_equal(other.values, self.values) + elif isinstance(other, DenseVector): + if self.size != len(other): + return False + return Vectors._equals(self.indices, self.values, list(xrange(len(other))), other.array) + return False def __getitem__(self, index): inds = self.indices @@ -739,6 +761,19 @@ def __getitem__(self, index): def __ne__(self, other): return not self.__eq__(other) + def __hash__(self): + result = 31 + self.size + nnz = 0 + i = 0 + while i < len(self.values) and nnz < 128: + if self.values[i] != 0: + result = 31 * result + int(self.indices[i]) + bits = _double_to_long_bits(self.values[i]) + result = 31 * result + (bits ^ (bits >> 32)) + nnz += 1 + i += 1 + return result + class Vectors(object): @@ -841,6 +876,31 @@ def parse(s): def zeros(size): return DenseVector(np.zeros(size)) + @staticmethod + def _equals(v1_indices, v1_values, v2_indices, v2_values): + """ + Check equality between sparse/dense vectors, + v1_indices and v2_indices assume to be strictly increasing. 
+ """ + v1_size = len(v1_values) + v2_size = len(v2_values) + k1 = 0 + k2 = 0 + all_equal = True + while all_equal: + while k1 < v1_size and v1_values[k1] == 0: + k1 += 1 + while k2 < v2_size and v2_values[k2] == 0: + k2 += 1 + + if k1 >= v1_size or k2 >= v2_size: + return k1 >= v1_size and k2 >= v2_size + + all_equal = v1_indices[k1] == v2_indices[k2] and v1_values[k1] == v2_values[k2] + k1 += 1 + k2 += 1 + return all_equal + class Matrix(object): diff --git a/python/pyspark/mllib/tests.py b/python/pyspark/mllib/tests.py index 5097c5e8ba4cd..636f9a06cab7b 100644 --- a/python/pyspark/mllib/tests.py +++ b/python/pyspark/mllib/tests.py @@ -194,6 +194,38 @@ def test_squared_distance(self): self.assertEquals(3.0, _squared_distance(sv, arr)) self.assertEquals(3.0, _squared_distance(sv, narr)) + def test_hash(self): + v1 = DenseVector([0.0, 1.0, 0.0, 5.5]) + v2 = SparseVector(4, [(1, 1.0), (3, 5.5)]) + v3 = DenseVector([0.0, 1.0, 0.0, 5.5]) + v4 = SparseVector(4, [(1, 1.0), (3, 2.5)]) + self.assertEquals(hash(v1), hash(v2)) + self.assertEquals(hash(v1), hash(v3)) + self.assertEquals(hash(v2), hash(v3)) + self.assertFalse(hash(v1) == hash(v4)) + self.assertFalse(hash(v2) == hash(v4)) + + def test_eq(self): + v1 = DenseVector([0.0, 1.0, 0.0, 5.5]) + v2 = SparseVector(4, [(1, 1.0), (3, 5.5)]) + v3 = DenseVector([0.0, 1.0, 0.0, 5.5]) + v4 = SparseVector(6, [(1, 1.0), (3, 5.5)]) + v5 = DenseVector([0.0, 1.0, 0.0, 2.5]) + v6 = SparseVector(4, [(1, 1.0), (3, 2.5)]) + self.assertEquals(v1, v2) + self.assertEquals(v1, v3) + self.assertFalse(v2 == v4) + self.assertFalse(v1 == v5) + self.assertFalse(v1 == v6) + + def test_equals(self): + indices = [1, 2, 4] + values = [1., 3., 2.] + self.assertTrue(Vectors._equals(indices, values, list(range(5)), [0., 1., 3., 0., 2.])) + self.assertFalse(Vectors._equals(indices, values, list(range(5)), [0., 3., 1., 0., 2.])) + self.assertFalse(Vectors._equals(indices, values, list(range(5)), [0., 3., 0., 2.])) + self.assertFalse(Vectors._equals(indices, values, list(range(5)), [0., 1., 3., 2., 2.])) + def test_conversion(self): # numpy arrays should be automatically upcast to float64 # tests for fix of [SPARK-5089] From 610971ecfe858b1a48ce69b25614afe52bcbe77f Mon Sep 17 00:00:00 2001 From: noelsmith Date: Mon, 14 Sep 2015 21:58:52 -0700 Subject: [PATCH 287/802] [SPARK-10273] Add @since annotation to pyspark.mllib.feature Duplicated the since decorator from pyspark.sql into pyspark (also tweaked to handle functions without docstrings). Added since to methods + "versionadded::" to classes (derived from the git file history in pyspark). Author: noelsmith Closes #8633 from noel-smith/SPARK-10273-since-mllib-feature. --- python/pyspark/mllib/feature.py | 58 ++++++++++++++++++++++++++++++++- 1 file changed, 57 insertions(+), 1 deletion(-) diff --git a/python/pyspark/mllib/feature.py b/python/pyspark/mllib/feature.py index f921e3ad1a314..7b077b058c3fd 100644 --- a/python/pyspark/mllib/feature.py +++ b/python/pyspark/mllib/feature.py @@ -30,7 +30,7 @@ from py4j.protocol import Py4JJavaError -from pyspark import SparkContext +from pyspark import SparkContext, since from pyspark.rdd import RDD, ignore_unicode_prefix from pyspark.mllib.common import callMLlibFunc, JavaModelWrapper from pyspark.mllib.linalg import ( @@ -84,11 +84,14 @@ class Normalizer(VectorTransformer): >>> nor2 = Normalizer(float("inf")) >>> nor2.transform(v) DenseVector([0.0, 0.5, 1.0]) + + .. 
versionadded:: 1.2.0 """ def __init__(self, p=2.0): assert p >= 1.0, "p should be greater than 1.0" self.p = float(p) + @since('1.2.0') def transform(self, vector): """ Applies unit length normalization on a vector. @@ -133,7 +136,11 @@ class StandardScalerModel(JavaVectorTransformer): .. note:: Experimental Represents a StandardScaler model that can transform vectors. + + .. versionadded:: 1.2.0 """ + + @since('1.2.0') def transform(self, vector): """ Applies standardization transformation on a vector. @@ -149,6 +156,7 @@ def transform(self, vector): """ return JavaVectorTransformer.transform(self, vector) + @since('1.4.0') def setWithMean(self, withMean): """ Setter of the boolean which decides @@ -157,6 +165,7 @@ def setWithMean(self, withMean): self.call("setWithMean", withMean) return self + @since('1.4.0') def setWithStd(self, withStd): """ Setter of the boolean which decides @@ -189,6 +198,8 @@ class StandardScaler(object): >>> for r in result.collect(): r DenseVector([-0.7071, 0.7071, -0.7071]) DenseVector([0.7071, -0.7071, 0.7071]) + + .. versionadded:: 1.2.0 """ def __init__(self, withMean=False, withStd=True): if not (withMean or withStd): @@ -196,6 +207,7 @@ def __init__(self, withMean=False, withStd=True): self.withMean = withMean self.withStd = withStd + @since('1.2.0') def fit(self, dataset): """ Computes the mean and variance and stores as a model to be used @@ -215,7 +227,11 @@ class ChiSqSelectorModel(JavaVectorTransformer): .. note:: Experimental Represents a Chi Squared selector model. + + .. versionadded:: 1.4.0 """ + + @since('1.4.0') def transform(self, vector): """ Applies transformation on a vector. @@ -245,10 +261,13 @@ class ChiSqSelector(object): SparseVector(1, {0: 6.0}) >>> model.transform(DenseVector([8.0, 9.0, 5.0])) DenseVector([5.0]) + + .. versionadded:: 1.4.0 """ def __init__(self, numTopFeatures): self.numTopFeatures = int(numTopFeatures) + @since('1.4.0') def fit(self, data): """ Returns a ChiSquared feature selector. @@ -265,6 +284,8 @@ def fit(self, data): class PCAModel(JavaVectorTransformer): """ Model fitted by [[PCA]] that can project vectors to a low-dimensional space using PCA. + + .. versionadded:: 1.5.0 """ @@ -281,6 +302,8 @@ class PCA(object): 1.648... >>> pcArray[1] -4.013... + + .. versionadded:: 1.5.0 """ def __init__(self, k): """ @@ -288,6 +311,7 @@ def __init__(self, k): """ self.k = int(k) + @since('1.5.0') def fit(self, data): """ Computes a [[PCAModel]] that contains the principal components of the input vectors. @@ -312,14 +336,18 @@ class HashingTF(object): >>> doc = "a a b b c d".split(" ") >>> htf.transform(doc) SparseVector(100, {...}) + + .. versionadded:: 1.2.0 """ def __init__(self, numFeatures=1 << 20): self.numFeatures = numFeatures + @since('1.2.0') def indexOf(self, term): """ Returns the index of the input term. """ return hash(term) % self.numFeatures + @since('1.2.0') def transform(self, document): """ Transforms the input document (list of terms) to term frequency @@ -339,7 +367,10 @@ def transform(self, document): class IDFModel(JavaVectorTransformer): """ Represents an IDF model that can transform term frequency vectors. + + .. versionadded:: 1.2.0 """ + @since('1.2.0') def transform(self, x): """ Transforms term frequency (TF) vectors to TF-IDF vectors. @@ -358,6 +389,7 @@ def transform(self, x): """ return JavaVectorTransformer.transform(self, x) + @since('1.4.0') def idf(self): """ Returns the current IDF vector. 
@@ -401,10 +433,13 @@ class IDF(object): DenseVector([0.0, 0.0, 1.3863, 0.863]) >>> model.transform(Vectors.sparse(n, (1, 3), (1.0, 2.0))) SparseVector(4, {1: 0.0, 3: 0.5754}) + + .. versionadded:: 1.2.0 """ def __init__(self, minDocFreq=0): self.minDocFreq = minDocFreq + @since('1.2.0') def fit(self, dataset): """ Computes the inverse document frequency. @@ -420,7 +455,10 @@ def fit(self, dataset): class Word2VecModel(JavaVectorTransformer, JavaSaveable, JavaLoader): """ class for Word2Vec model + + .. versionadded:: 1.2.0 """ + @since('1.2.0') def transform(self, word): """ Transforms a word to its vector representation @@ -435,6 +473,7 @@ def transform(self, word): except Py4JJavaError: raise ValueError("%s not found" % word) + @since('1.2.0') def findSynonyms(self, word, num): """ Find synonyms of a word @@ -450,6 +489,7 @@ def findSynonyms(self, word, num): words, similarity = self.call("findSynonyms", word, num) return zip(words, similarity) + @since('1.4.0') def getVectors(self): """ Returns a map of words to their vector representations. @@ -457,7 +497,11 @@ def getVectors(self): return self.call("getVectors") @classmethod + @since('1.5.0') def load(cls, sc, path): + """ + Load a model from the given path. + """ jmodel = sc._jvm.org.apache.spark.mllib.feature \ .Word2VecModel.load(sc._jsc.sc(), path) return Word2VecModel(jmodel) @@ -507,6 +551,8 @@ class Word2Vec(object): ... rmtree(path) ... except OSError: ... pass + + .. versionadded:: 1.2.0 """ def __init__(self): """ @@ -519,6 +565,7 @@ def __init__(self): self.seed = random.randint(0, sys.maxsize) self.minCount = 5 + @since('1.2.0') def setVectorSize(self, vectorSize): """ Sets vector size (default: 100). @@ -526,6 +573,7 @@ def setVectorSize(self, vectorSize): self.vectorSize = vectorSize return self + @since('1.2.0') def setLearningRate(self, learningRate): """ Sets initial learning rate (default: 0.025). @@ -533,6 +581,7 @@ def setLearningRate(self, learningRate): self.learningRate = learningRate return self + @since('1.2.0') def setNumPartitions(self, numPartitions): """ Sets number of partitions (default: 1). Use a small number for @@ -541,6 +590,7 @@ def setNumPartitions(self, numPartitions): self.numPartitions = numPartitions return self + @since('1.2.0') def setNumIterations(self, numIterations): """ Sets number of iterations (default: 1), which should be smaller @@ -549,6 +599,7 @@ def setNumIterations(self, numIterations): self.numIterations = numIterations return self + @since('1.2.0') def setSeed(self, seed): """ Sets random seed. @@ -556,6 +607,7 @@ def setSeed(self, seed): self.seed = seed return self + @since('1.4.0') def setMinCount(self, minCount): """ Sets minCount, the minimum number of times a token must appear @@ -564,6 +616,7 @@ def setMinCount(self, minCount): self.minCount = minCount return self + @since('1.2.0') def fit(self, data): """ Computes the vector representation of each word in vocabulary. @@ -596,10 +649,13 @@ class ElementwiseProduct(VectorTransformer): >>> rdd = sc.parallelize([a, b]) >>> eprod.transform(rdd).collect() [DenseVector([2.0, 2.0, 9.0]), DenseVector([9.0, 6.0, 12.0])] + + .. versionadded:: 1.5.0 """ def __init__(self, scalingVector): self.scalingVector = _convert_to_vector(scalingVector) + @since('1.5.0') def transform(self, vector): """ Computes the Hadamard product of the vector. 
From a2249359d5b0368318a714b292bb1d0dc70c0e27 Mon Sep 17 00:00:00 2001 From: Yu ISHIKAWA Date: Mon, 14 Sep 2015 21:59:40 -0700 Subject: [PATCH 288/802] [SPARK-10275] [MLLIB] Add @since annotation to pyspark.mllib.random Author: Yu ISHIKAWA Closes #8666 from yu-iskw/SPARK-10275. --- python/pyspark/mllib/random.py | 15 +++++++++++++++ 1 file changed, 15 insertions(+) diff --git a/python/pyspark/mllib/random.py b/python/pyspark/mllib/random.py index 06fbc0eb6aef0..9c733b1332bc0 100644 --- a/python/pyspark/mllib/random.py +++ b/python/pyspark/mllib/random.py @@ -21,6 +21,7 @@ from functools import wraps +from pyspark import since from pyspark.mllib.common import callMLlibFunc @@ -39,9 +40,12 @@ class RandomRDDs(object): """ Generator methods for creating RDDs comprised of i.i.d samples from some distribution. + + .. addedversion:: 1.1.0 """ @staticmethod + @since("1.1.0") def uniformRDD(sc, size, numPartitions=None, seed=None): """ Generates an RDD comprised of i.i.d. samples from the @@ -72,6 +76,7 @@ def uniformRDD(sc, size, numPartitions=None, seed=None): return callMLlibFunc("uniformRDD", sc._jsc, size, numPartitions, seed) @staticmethod + @since("1.1.0") def normalRDD(sc, size, numPartitions=None, seed=None): """ Generates an RDD comprised of i.i.d. samples from the standard normal @@ -100,6 +105,7 @@ def normalRDD(sc, size, numPartitions=None, seed=None): return callMLlibFunc("normalRDD", sc._jsc, size, numPartitions, seed) @staticmethod + @since("1.3.0") def logNormalRDD(sc, mean, std, size, numPartitions=None, seed=None): """ Generates an RDD comprised of i.i.d. samples from the log normal @@ -132,6 +138,7 @@ def logNormalRDD(sc, mean, std, size, numPartitions=None, seed=None): size, numPartitions, seed) @staticmethod + @since("1.1.0") def poissonRDD(sc, mean, size, numPartitions=None, seed=None): """ Generates an RDD comprised of i.i.d. samples from the Poisson @@ -158,6 +165,7 @@ def poissonRDD(sc, mean, size, numPartitions=None, seed=None): return callMLlibFunc("poissonRDD", sc._jsc, float(mean), size, numPartitions, seed) @staticmethod + @since("1.3.0") def exponentialRDD(sc, mean, size, numPartitions=None, seed=None): """ Generates an RDD comprised of i.i.d. samples from the Exponential @@ -184,6 +192,7 @@ def exponentialRDD(sc, mean, size, numPartitions=None, seed=None): return callMLlibFunc("exponentialRDD", sc._jsc, float(mean), size, numPartitions, seed) @staticmethod + @since("1.3.0") def gammaRDD(sc, shape, scale, size, numPartitions=None, seed=None): """ Generates an RDD comprised of i.i.d. samples from the Gamma @@ -216,6 +225,7 @@ def gammaRDD(sc, shape, scale, size, numPartitions=None, seed=None): @staticmethod @toArray + @since("1.1.0") def uniformVectorRDD(sc, numRows, numCols, numPartitions=None, seed=None): """ Generates an RDD comprised of vectors containing i.i.d. samples drawn @@ -241,6 +251,7 @@ def uniformVectorRDD(sc, numRows, numCols, numPartitions=None, seed=None): @staticmethod @toArray + @since("1.1.0") def normalVectorRDD(sc, numRows, numCols, numPartitions=None, seed=None): """ Generates an RDD comprised of vectors containing i.i.d. samples drawn @@ -266,6 +277,7 @@ def normalVectorRDD(sc, numRows, numCols, numPartitions=None, seed=None): @staticmethod @toArray + @since("1.3.0") def logNormalVectorRDD(sc, mean, std, numRows, numCols, numPartitions=None, seed=None): """ Generates an RDD comprised of vectors containing i.i.d. 
samples drawn @@ -300,6 +312,7 @@ def logNormalVectorRDD(sc, mean, std, numRows, numCols, numPartitions=None, seed @staticmethod @toArray + @since("1.1.0") def poissonVectorRDD(sc, mean, numRows, numCols, numPartitions=None, seed=None): """ Generates an RDD comprised of vectors containing i.i.d. samples drawn @@ -330,6 +343,7 @@ def poissonVectorRDD(sc, mean, numRows, numCols, numPartitions=None, seed=None): @staticmethod @toArray + @since("1.3.0") def exponentialVectorRDD(sc, mean, numRows, numCols, numPartitions=None, seed=None): """ Generates an RDD comprised of vectors containing i.i.d. samples drawn @@ -360,6 +374,7 @@ def exponentialVectorRDD(sc, mean, numRows, numCols, numPartitions=None, seed=No @staticmethod @toArray + @since("1.3.0") def gammaVectorRDD(sc, shape, scale, numRows, numCols, numPartitions=None, seed=None): """ Generates an RDD comprised of vectors containing i.i.d. samples drawn From 833be73314b85b390a9007ed6ed63dc47bbd9e4f Mon Sep 17 00:00:00 2001 From: Jacek Laskowski Date: Mon, 14 Sep 2015 23:40:29 -0700 Subject: [PATCH 289/802] Small fixes to docs Links work now properly + consistent use of *Spark standalone cluster* (Spark uppercase + lowercase the rest -- seems agreed in the other places in the docs). Author: Jacek Laskowski Closes #8759 from jaceklaskowski/docs-submitting-apps. --- docs/submitting-applications.md | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/docs/submitting-applications.md b/docs/submitting-applications.md index e58645274e525..7ea4d6f1a3f8f 100644 --- a/docs/submitting-applications.md +++ b/docs/submitting-applications.md @@ -65,8 +65,8 @@ For Python applications, simply pass a `.py` file in the place of ` Date: Mon, 14 Sep 2015 23:41:06 -0700 Subject: [PATCH 290/802] [SPARK-10598] [DOCS] Comments preceding toMessage method state: "The edge partition is encoded in the lower * 30 bytes of the Int, and the position is encoded in the upper 2 bytes of the Int.". References to bytes should be changed to bits. This contribution is my original work and I license the work to the Spark project under it's open source license. Author: Robin East Closes #8756 from insidedctm/master. --- .../org/apache/spark/graphx/impl/RoutingTablePartition.scala | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/graphx/src/main/scala/org/apache/spark/graphx/impl/RoutingTablePartition.scala b/graphx/src/main/scala/org/apache/spark/graphx/impl/RoutingTablePartition.scala index eb3c997e0f3c0..4f1260a5a67b2 100644 --- a/graphx/src/main/scala/org/apache/spark/graphx/impl/RoutingTablePartition.scala +++ b/graphx/src/main/scala/org/apache/spark/graphx/impl/RoutingTablePartition.scala @@ -34,7 +34,7 @@ object RoutingTablePartition { /** * A message from an edge partition to a vertex specifying the position in which the edge * partition references the vertex (src, dst, or both). The edge partition is encoded in the lower - * 30 bytes of the Int, and the position is encoded in the upper 2 bytes of the Int. + * 30 bits of the Int, and the position is encoded in the upper 2 bits of the Int. */ type RoutingTableMessage = (VertexId, Int) From 09b7e7c19897549a8622aec095f27b8b38a1a4d3 Mon Sep 17 00:00:00 2001 From: Reynold Xin Date: Tue, 15 Sep 2015 00:54:20 -0700 Subject: [PATCH 291/802] Update version to 1.6.0-SNAPSHOT. Author: Reynold Xin Closes #8350 from rxin/1.6. 
--- R/pkg/DESCRIPTION | 2 +- assembly/pom.xml | 2 +- bagel/pom.xml | 2 +- core/pom.xml | 2 +- core/src/main/scala/org/apache/spark/package.scala | 2 +- docs/_config.yml | 4 ++-- examples/pom.xml | 2 +- external/flume-assembly/pom.xml | 2 +- external/flume-sink/pom.xml | 2 +- external/flume/pom.xml | 2 +- external/kafka-assembly/pom.xml | 2 +- external/kafka/pom.xml | 2 +- external/mqtt-assembly/pom.xml | 2 +- external/mqtt/pom.xml | 2 +- external/twitter/pom.xml | 2 +- external/zeromq/pom.xml | 2 +- extras/java8-tests/pom.xml | 2 +- extras/kinesis-asl-assembly/pom.xml | 2 +- extras/kinesis-asl/pom.xml | 2 +- extras/spark-ganglia-lgpl/pom.xml | 2 +- graphx/pom.xml | 2 +- launcher/pom.xml | 2 +- mllib/pom.xml | 2 +- network/common/pom.xml | 2 +- network/shuffle/pom.xml | 2 +- network/yarn/pom.xml | 2 +- pom.xml | 2 +- project/MimaBuild.scala | 2 +- project/MimaExcludes.scala | 13 +++++++++++-- repl/pom.xml | 2 +- sql/catalyst/pom.xml | 2 +- sql/core/pom.xml | 2 +- sql/hive-thriftserver/pom.xml | 2 +- sql/hive/pom.xml | 2 +- streaming/pom.xml | 2 +- tools/pom.xml | 2 +- unsafe/pom.xml | 2 +- yarn/pom.xml | 2 +- 38 files changed, 49 insertions(+), 40 deletions(-) diff --git a/R/pkg/DESCRIPTION b/R/pkg/DESCRIPTION index d0d7201f004a2..a3a16c42a6214 100644 --- a/R/pkg/DESCRIPTION +++ b/R/pkg/DESCRIPTION @@ -1,7 +1,7 @@ Package: SparkR Type: Package Title: R frontend for Spark -Version: 1.5.0 +Version: 1.6.0 Date: 2013-09-09 Author: The Apache Software Foundation Maintainer: Shivaram Venkataraman diff --git a/assembly/pom.xml b/assembly/pom.xml index e9c6d26ccddc7..4b60ee00ffbe5 100644 --- a/assembly/pom.xml +++ b/assembly/pom.xml @@ -21,7 +21,7 @@ org.apache.spark spark-parent_2.10 - 1.5.0-SNAPSHOT + 1.6.0-SNAPSHOT ../pom.xml diff --git a/bagel/pom.xml b/bagel/pom.xml index ed5c37e595a96..3baf8d47b4dc7 100644 --- a/bagel/pom.xml +++ b/bagel/pom.xml @@ -21,7 +21,7 @@ org.apache.spark spark-parent_2.10 - 1.5.0-SNAPSHOT + 1.6.0-SNAPSHOT ../pom.xml diff --git a/core/pom.xml b/core/pom.xml index a46292c13bcc0..e31d90f608892 100644 --- a/core/pom.xml +++ b/core/pom.xml @@ -21,7 +21,7 @@ org.apache.spark spark-parent_2.10 - 1.5.0-SNAPSHOT + 1.6.0-SNAPSHOT ../pom.xml diff --git a/core/src/main/scala/org/apache/spark/package.scala b/core/src/main/scala/org/apache/spark/package.scala index 8ae76c5f72f2e..7515aad09db73 100644 --- a/core/src/main/scala/org/apache/spark/package.scala +++ b/core/src/main/scala/org/apache/spark/package.scala @@ -43,5 +43,5 @@ package org.apache package object spark { // For package docs only - val SPARK_VERSION = "1.5.0-SNAPSHOT" + val SPARK_VERSION = "1.6.0-SNAPSHOT" } diff --git a/docs/_config.yml b/docs/_config.yml index c0e031a83ba9c..c59cc465ef89d 100644 --- a/docs/_config.yml +++ b/docs/_config.yml @@ -14,8 +14,8 @@ include: # These allow the documentation to be updated with newer releases # of Spark, Scala, and Mesos. 
-SPARK_VERSION: 1.5.0-SNAPSHOT -SPARK_VERSION_SHORT: 1.5.0 +SPARK_VERSION: 1.6.0-SNAPSHOT +SPARK_VERSION_SHORT: 1.6.0 SCALA_BINARY_VERSION: "2.10" SCALA_VERSION: "2.10.4" MESOS_VERSION: 0.21.0 diff --git a/examples/pom.xml b/examples/pom.xml index e6884b09dca94..f5ab2a7fdc098 100644 --- a/examples/pom.xml +++ b/examples/pom.xml @@ -21,7 +21,7 @@ org.apache.spark spark-parent_2.10 - 1.5.0-SNAPSHOT + 1.6.0-SNAPSHOT ../pom.xml diff --git a/external/flume-assembly/pom.xml b/external/flume-assembly/pom.xml index 561ed4babe5d0..dceedcf23ed5b 100644 --- a/external/flume-assembly/pom.xml +++ b/external/flume-assembly/pom.xml @@ -21,7 +21,7 @@ org.apache.spark spark-parent_2.10 - 1.5.0-SNAPSHOT + 1.6.0-SNAPSHOT ../../pom.xml diff --git a/external/flume-sink/pom.xml b/external/flume-sink/pom.xml index 0664cfb2021e1..d7c2ac474a18d 100644 --- a/external/flume-sink/pom.xml +++ b/external/flume-sink/pom.xml @@ -21,7 +21,7 @@ org.apache.spark spark-parent_2.10 - 1.5.0-SNAPSHOT + 1.6.0-SNAPSHOT ../../pom.xml diff --git a/external/flume/pom.xml b/external/flume/pom.xml index 14f7daaf417e0..132062f94fb45 100644 --- a/external/flume/pom.xml +++ b/external/flume/pom.xml @@ -21,7 +21,7 @@ org.apache.spark spark-parent_2.10 - 1.5.0-SNAPSHOT + 1.6.0-SNAPSHOT ../../pom.xml diff --git a/external/kafka-assembly/pom.xml b/external/kafka-assembly/pom.xml index 6f4e2a89e9af7..a9ed39ef8c9a0 100644 --- a/external/kafka-assembly/pom.xml +++ b/external/kafka-assembly/pom.xml @@ -21,7 +21,7 @@ org.apache.spark spark-parent_2.10 - 1.5.0-SNAPSHOT + 1.6.0-SNAPSHOT ../../pom.xml diff --git a/external/kafka/pom.xml b/external/kafka/pom.xml index ded863bd985e8..05abd9e2e6810 100644 --- a/external/kafka/pom.xml +++ b/external/kafka/pom.xml @@ -21,7 +21,7 @@ org.apache.spark spark-parent_2.10 - 1.5.0-SNAPSHOT + 1.6.0-SNAPSHOT ../../pom.xml diff --git a/external/mqtt-assembly/pom.xml b/external/mqtt-assembly/pom.xml index 8412600633734..89713a28ca6a8 100644 --- a/external/mqtt-assembly/pom.xml +++ b/external/mqtt-assembly/pom.xml @@ -21,7 +21,7 @@ org.apache.spark spark-parent_2.10 - 1.5.0-SNAPSHOT + 1.6.0-SNAPSHOT ../../pom.xml diff --git a/external/mqtt/pom.xml b/external/mqtt/pom.xml index 69b309876a0db..05e6338a08b0a 100644 --- a/external/mqtt/pom.xml +++ b/external/mqtt/pom.xml @@ -21,7 +21,7 @@ org.apache.spark spark-parent_2.10 - 1.5.0-SNAPSHOT + 1.6.0-SNAPSHOT ../../pom.xml diff --git a/external/twitter/pom.xml b/external/twitter/pom.xml index 178ae8de13b57..244ad58ae9593 100644 --- a/external/twitter/pom.xml +++ b/external/twitter/pom.xml @@ -21,7 +21,7 @@ org.apache.spark spark-parent_2.10 - 1.5.0-SNAPSHOT + 1.6.0-SNAPSHOT ../../pom.xml diff --git a/external/zeromq/pom.xml b/external/zeromq/pom.xml index 37bfd10d43663..171df8682c848 100644 --- a/external/zeromq/pom.xml +++ b/external/zeromq/pom.xml @@ -21,7 +21,7 @@ org.apache.spark spark-parent_2.10 - 1.5.0-SNAPSHOT + 1.6.0-SNAPSHOT ../../pom.xml diff --git a/extras/java8-tests/pom.xml b/extras/java8-tests/pom.xml index 3636a9037d43f..81794a8536318 100644 --- a/extras/java8-tests/pom.xml +++ b/extras/java8-tests/pom.xml @@ -20,7 +20,7 @@ org.apache.spark spark-parent_2.10 - 1.5.0-SNAPSHOT + 1.6.0-SNAPSHOT ../../pom.xml diff --git a/extras/kinesis-asl-assembly/pom.xml b/extras/kinesis-asl-assembly/pom.xml index 51af3e6f2225f..61ba4787fbf90 100644 --- a/extras/kinesis-asl-assembly/pom.xml +++ b/extras/kinesis-asl-assembly/pom.xml @@ -21,7 +21,7 @@ org.apache.spark spark-parent_2.10 - 1.5.0-SNAPSHOT + 1.6.0-SNAPSHOT ../../pom.xml diff --git a/extras/kinesis-asl/pom.xml 
b/extras/kinesis-asl/pom.xml index 521b53e230c4a..6dd8ff69c2943 100644 --- a/extras/kinesis-asl/pom.xml +++ b/extras/kinesis-asl/pom.xml @@ -20,7 +20,7 @@ org.apache.spark spark-parent_2.10 - 1.5.0-SNAPSHOT + 1.6.0-SNAPSHOT ../../pom.xml diff --git a/extras/spark-ganglia-lgpl/pom.xml b/extras/spark-ganglia-lgpl/pom.xml index 478d0019a25f0..87a4f05a05961 100644 --- a/extras/spark-ganglia-lgpl/pom.xml +++ b/extras/spark-ganglia-lgpl/pom.xml @@ -20,7 +20,7 @@ org.apache.spark spark-parent_2.10 - 1.5.0-SNAPSHOT + 1.6.0-SNAPSHOT ../../pom.xml diff --git a/graphx/pom.xml b/graphx/pom.xml index 853dea9a7795e..202fc19002d12 100644 --- a/graphx/pom.xml +++ b/graphx/pom.xml @@ -21,7 +21,7 @@ org.apache.spark spark-parent_2.10 - 1.5.0-SNAPSHOT + 1.6.0-SNAPSHOT ../pom.xml diff --git a/launcher/pom.xml b/launcher/pom.xml index 2fd768d8119c4..ed38e66aa2467 100644 --- a/launcher/pom.xml +++ b/launcher/pom.xml @@ -22,7 +22,7 @@ org.apache.spark spark-parent_2.10 - 1.5.0-SNAPSHOT + 1.6.0-SNAPSHOT ../pom.xml diff --git a/mllib/pom.xml b/mllib/pom.xml index a5db14407b4fc..22c0c6008ba37 100644 --- a/mllib/pom.xml +++ b/mllib/pom.xml @@ -21,7 +21,7 @@ org.apache.spark spark-parent_2.10 - 1.5.0-SNAPSHOT + 1.6.0-SNAPSHOT ../pom.xml diff --git a/network/common/pom.xml b/network/common/pom.xml index 4141fcb8267a5..1cc054a8936c5 100644 --- a/network/common/pom.xml +++ b/network/common/pom.xml @@ -22,7 +22,7 @@ org.apache.spark spark-parent_2.10 - 1.5.0-SNAPSHOT + 1.6.0-SNAPSHOT ../../pom.xml diff --git a/network/shuffle/pom.xml b/network/shuffle/pom.xml index 3d2edf9d94515..7a66c968041ce 100644 --- a/network/shuffle/pom.xml +++ b/network/shuffle/pom.xml @@ -22,7 +22,7 @@ org.apache.spark spark-parent_2.10 - 1.5.0-SNAPSHOT + 1.6.0-SNAPSHOT ../../pom.xml diff --git a/network/yarn/pom.xml b/network/yarn/pom.xml index a99f7c4392d3d..e745180eace78 100644 --- a/network/yarn/pom.xml +++ b/network/yarn/pom.xml @@ -22,7 +22,7 @@ org.apache.spark spark-parent_2.10 - 1.5.0-SNAPSHOT + 1.6.0-SNAPSHOT ../../pom.xml diff --git a/pom.xml b/pom.xml index 421357e141572..6535994641145 100644 --- a/pom.xml +++ b/pom.xml @@ -26,7 +26,7 @@ org.apache.spark spark-parent_2.10 - 1.5.0-SNAPSHOT + 1.6.0-SNAPSHOT pom Spark Project Parent POM http://spark.apache.org/ diff --git a/project/MimaBuild.scala b/project/MimaBuild.scala index f16bf989f200b..519052620246f 100644 --- a/project/MimaBuild.scala +++ b/project/MimaBuild.scala @@ -91,7 +91,7 @@ object MimaBuild { def mimaSettings(sparkHome: File, projectRef: ProjectRef) = { val organization = "org.apache.spark" - val previousSparkVersion = "1.4.0" + val previousSparkVersion = "1.5.0" val fullId = "spark-" + projectRef.project + "_2.10" mimaDefaultSettings ++ Seq(previousArtifact := Some(organization % fullId % previousSparkVersion), diff --git a/project/MimaExcludes.scala b/project/MimaExcludes.scala index 3b8b6c8ffa375..87b141cd3b058 100644 --- a/project/MimaExcludes.scala +++ b/project/MimaExcludes.scala @@ -35,8 +35,17 @@ object MimaExcludes { def excludes(version: String) = version match { case v if v.startsWith("1.6") => Seq( - MimaBuild.excludeSparkPackage("network") - ) + MimaBuild.excludeSparkPackage("deploy"), + MimaBuild.excludeSparkPackage("network"), + // These are needed if checking against the sbt build, since they are part of + // the maven-generated artifacts in 1.3. + excludePackage("org.spark-project.jetty"), + MimaBuild.excludeSparkPackage("unused"), + // SQL execution is considered private. 
+ excludePackage("org.apache.spark.sql.execution") + ) ++ + MimaBuild.excludeSparkClass("streaming.flume.FlumeTestUtils") ++ + MimaBuild.excludeSparkClass("streaming.flume.PollingFlumeTestUtils") case v if v.startsWith("1.5") => Seq( MimaBuild.excludeSparkPackage("network"), diff --git a/repl/pom.xml b/repl/pom.xml index a5a0f1fc2c857..5cf416a4a5448 100644 --- a/repl/pom.xml +++ b/repl/pom.xml @@ -21,7 +21,7 @@ org.apache.spark spark-parent_2.10 - 1.5.0-SNAPSHOT + 1.6.0-SNAPSHOT ../pom.xml diff --git a/sql/catalyst/pom.xml b/sql/catalyst/pom.xml index 75ab575dfde83..6cfd53e868f83 100644 --- a/sql/catalyst/pom.xml +++ b/sql/catalyst/pom.xml @@ -22,7 +22,7 @@ org.apache.spark spark-parent_2.10 - 1.5.0-SNAPSHOT + 1.6.0-SNAPSHOT ../../pom.xml diff --git a/sql/core/pom.xml b/sql/core/pom.xml index 349007789f634..465aa3a3888c2 100644 --- a/sql/core/pom.xml +++ b/sql/core/pom.xml @@ -22,7 +22,7 @@ org.apache.spark spark-parent_2.10 - 1.5.0-SNAPSHOT + 1.6.0-SNAPSHOT ../../pom.xml diff --git a/sql/hive-thriftserver/pom.xml b/sql/hive-thriftserver/pom.xml index 3566c87dd248c..f7fe085f34d84 100644 --- a/sql/hive-thriftserver/pom.xml +++ b/sql/hive-thriftserver/pom.xml @@ -22,7 +22,7 @@ org.apache.spark spark-parent_2.10 - 1.5.0-SNAPSHOT + 1.6.0-SNAPSHOT ../../pom.xml diff --git a/sql/hive/pom.xml b/sql/hive/pom.xml index be1607476e254..ac67fe5f47be9 100644 --- a/sql/hive/pom.xml +++ b/sql/hive/pom.xml @@ -22,7 +22,7 @@ org.apache.spark spark-parent_2.10 - 1.5.0-SNAPSHOT + 1.6.0-SNAPSHOT ../../pom.xml diff --git a/streaming/pom.xml b/streaming/pom.xml index 697895e72fe5b..5cc9001b0e9ab 100644 --- a/streaming/pom.xml +++ b/streaming/pom.xml @@ -21,7 +21,7 @@ org.apache.spark spark-parent_2.10 - 1.5.0-SNAPSHOT + 1.6.0-SNAPSHOT ../pom.xml diff --git a/tools/pom.xml b/tools/pom.xml index 298ee2348b58e..1e64f280e5bed 100644 --- a/tools/pom.xml +++ b/tools/pom.xml @@ -20,7 +20,7 @@ org.apache.spark spark-parent_2.10 - 1.5.0-SNAPSHOT + 1.6.0-SNAPSHOT ../pom.xml diff --git a/unsafe/pom.xml b/unsafe/pom.xml index 89475ee3cf5a1..066abe92e51c0 100644 --- a/unsafe/pom.xml +++ b/unsafe/pom.xml @@ -22,7 +22,7 @@ org.apache.spark spark-parent_2.10 - 1.5.0-SNAPSHOT + 1.6.0-SNAPSHOT ../pom.xml diff --git a/yarn/pom.xml b/yarn/pom.xml index f6737695307a2..d8e4a4bbead81 100644 --- a/yarn/pom.xml +++ b/yarn/pom.xml @@ -20,7 +20,7 @@ org.apache.spark spark-parent_2.10 - 1.5.0-SNAPSHOT + 1.6.0-SNAPSHOT ../pom.xml From c35fdcb7e9c01271ce560dba4e0bd37569c8f5d1 Mon Sep 17 00:00:00 2001 From: Yuhao Yang Date: Tue, 15 Sep 2015 09:58:49 -0700 Subject: [PATCH 292/802] [SPARK-10491] [MLLIB] move RowMatrix.dspr to BLAS jira: https://issues.apache.org/jira/browse/SPARK-10491 We implemented dspr with sparse vector support in `RowMatrix`. This method is also used in WeightedLeastSquares and other places. It would be useful to move it to `linalg.BLAS`. Let me know if new UT needed. Author: Yuhao Yang Closes #8663 from hhbyyh/movedspr. 
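For reference, `spr` accumulates `alpha * x * x^T` into the upper triangle of a symmetric matrix stored as a packed, column-major array. The following is an illustrative Python reference of that packed update for the dense case only; it is not Spark code, the actual implementation is the Scala `BLAS.spr` in the diff below:

```
# Illustrative reference for the dense case of A := alpha * x * x^T + A,
# with only the upper triangle of A stored column-major in a flat array U.
# Not Spark code; the real implementation is BLAS.spr in the diff below.
def spr_dense(alpha, x, U):
    n = len(x)
    for j in range(n):                    # column j
        col_start = j * (j + 1) // 2      # offset of A[0, j] in the packed array
        for i in range(j + 1):            # rows 0..j of the upper triangle
            U[col_start + i] += alpha * x[i] * x[j]

x = [1.0, 2.0, 3.0]
U = [0.0] * 6                             # packed upper triangle of a 3x3 matrix
spr_dense(1.0, x, U)
print(U)  # [1.0, 2.0, 4.0, 3.0, 6.0, 9.0] == [A00, A01, A11, A02, A12, A22]
```
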
--- .../spark/ml/optim/WeightedLeastSquares.scala | 4 +- .../org/apache/spark/mllib/linalg/BLAS.scala | 44 +++++++++++++++++++ .../mllib/linalg/distributed/RowMatrix.scala | 40 +---------------- .../apache/spark/mllib/linalg/BLASSuite.scala | 25 +++++++++++ 4 files changed, 72 insertions(+), 41 deletions(-) diff --git a/mllib/src/main/scala/org/apache/spark/ml/optim/WeightedLeastSquares.scala b/mllib/src/main/scala/org/apache/spark/ml/optim/WeightedLeastSquares.scala index a99e2ac4c6913..0ff8931b0bab4 100644 --- a/mllib/src/main/scala/org/apache/spark/ml/optim/WeightedLeastSquares.scala +++ b/mllib/src/main/scala/org/apache/spark/ml/optim/WeightedLeastSquares.scala @@ -88,7 +88,7 @@ private[ml] class WeightedLeastSquares( if (fitIntercept) { // shift centers // A^T A - aBar aBar^T - RowMatrix.dspr(-1.0, aBar, aaValues) + BLAS.spr(-1.0, aBar, aaValues) // A^T b - bBar aBar BLAS.axpy(-bBar, aBar, abBar) } @@ -203,7 +203,7 @@ private[ml] object WeightedLeastSquares { bbSum += w * b * b BLAS.axpy(w, a, aSum) BLAS.axpy(w * b, a, abSum) - RowMatrix.dspr(w, a, aaSum.values) + BLAS.spr(w, a, aaSum) this } diff --git a/mllib/src/main/scala/org/apache/spark/mllib/linalg/BLAS.scala b/mllib/src/main/scala/org/apache/spark/mllib/linalg/BLAS.scala index 9ee81eda8a8c0..df9f4ae145b88 100644 --- a/mllib/src/main/scala/org/apache/spark/mllib/linalg/BLAS.scala +++ b/mllib/src/main/scala/org/apache/spark/mllib/linalg/BLAS.scala @@ -236,6 +236,50 @@ private[spark] object BLAS extends Serializable with Logging { _nativeBLAS } + /** + * Adds alpha * x * x.t to a matrix in-place. This is the same as BLAS's ?SPR. + * + * @param U the upper triangular part of the matrix in a [[DenseVector]](column major) + */ + def spr(alpha: Double, v: Vector, U: DenseVector): Unit = { + spr(alpha, v, U.values) + } + + /** + * Adds alpha * x * x.t to a matrix in-place. This is the same as BLAS's ?SPR. + * + * @param U the upper triangular part of the matrix packed in an array (column major) + */ + def spr(alpha: Double, v: Vector, U: Array[Double]): Unit = { + val n = v.size + v match { + case DenseVector(values) => + NativeBLAS.dspr("U", n, alpha, values, 1, U) + case SparseVector(size, indices, values) => + val nnz = indices.length + var colStartIdx = 0 + var prevCol = 0 + var col = 0 + var j = 0 + var i = 0 + var av = 0.0 + while (j < nnz) { + col = indices(j) + // Skip empty columns. + colStartIdx += (col - prevCol) * (col + prevCol + 1) / 2 + col = indices(j) + av = alpha * values(j) + i = 0 + while (i <= j) { + U(colStartIdx + indices(i)) += av * values(i) + i += 1 + } + j += 1 + prevCol = col + } + } + } + /** * A := alpha * x * x^T^ + A * @param alpha a real scalar that will be multiplied to x * x^T^. 
diff --git a/mllib/src/main/scala/org/apache/spark/mllib/linalg/distributed/RowMatrix.scala b/mllib/src/main/scala/org/apache/spark/mllib/linalg/distributed/RowMatrix.scala index 83779ac88989b..e55ef26858adb 100644 --- a/mllib/src/main/scala/org/apache/spark/mllib/linalg/distributed/RowMatrix.scala +++ b/mllib/src/main/scala/org/apache/spark/mllib/linalg/distributed/RowMatrix.scala @@ -24,7 +24,6 @@ import scala.collection.mutable.ListBuffer import breeze.linalg.{DenseMatrix => BDM, DenseVector => BDV, SparseVector => BSV, axpy => brzAxpy, svd => brzSvd, MatrixSingularException, inv} import breeze.numerics.{sqrt => brzSqrt} -import com.github.fommil.netlib.BLAS.{getInstance => blas} import org.apache.spark.Logging import org.apache.spark.SparkContext._ @@ -123,7 +122,7 @@ class RowMatrix @Since("1.0.0") ( // Compute the upper triangular part of the gram matrix. val GU = rows.treeAggregate(new BDV[Double](new Array[Double](nt)))( seqOp = (U, v) => { - RowMatrix.dspr(1.0, v, U.data) + BLAS.spr(1.0, v, U.data) U }, combOp = (U1, U2) => U1 += U2) @@ -673,43 +672,6 @@ class RowMatrix @Since("1.0.0") ( @Experimental object RowMatrix { - /** - * Adds alpha * x * x.t to a matrix in-place. This is the same as BLAS's DSPR. - * - * @param U the upper triangular part of the matrix packed in an array (column major) - */ - // TODO: SPARK-10491 - move this method to linalg.BLAS - private[spark] def dspr(alpha: Double, v: Vector, U: Array[Double]): Unit = { - // TODO: Find a better home (breeze?) for this method. - val n = v.size - v match { - case DenseVector(values) => - blas.dspr("U", n, alpha, values, 1, U) - case SparseVector(size, indices, values) => - val nnz = indices.length - var colStartIdx = 0 - var prevCol = 0 - var col = 0 - var j = 0 - var i = 0 - var av = 0.0 - while (j < nnz) { - col = indices(j) - // Skip empty columns. - colStartIdx += (col - prevCol) * (col + prevCol + 1) / 2 - col = indices(j) - av = alpha * values(j) - i = 0 - while (i <= j) { - U(colStartIdx + indices(i)) += av * values(i) - i += 1 - } - j += 1 - prevCol = col - } - } - } - /** * Fills a full square matrix from its upper triangular part. 
*/ diff --git a/mllib/src/test/scala/org/apache/spark/mllib/linalg/BLASSuite.scala b/mllib/src/test/scala/org/apache/spark/mllib/linalg/BLASSuite.scala index 8db5c8424abe9..96e5ffef7a131 100644 --- a/mllib/src/test/scala/org/apache/spark/mllib/linalg/BLASSuite.scala +++ b/mllib/src/test/scala/org/apache/spark/mllib/linalg/BLASSuite.scala @@ -126,6 +126,31 @@ class BLASSuite extends SparkFunSuite { } } + test("spr") { + // test dense vector + val alpha = 0.1 + val x = new DenseVector(Array(1.0, 2, 2.1, 4)) + val U = new DenseVector(Array(1.0, 2, 2, 3, 3, 3, 4, 4, 4, 4)) + val expected = new DenseVector(Array(1.1, 2.2, 2.4, 3.21, 3.42, 3.441, 4.4, 4.8, 4.84, 5.6)) + + spr(alpha, x, U) + assert(U ~== expected absTol 1e-9) + + val matrix33 = new DenseVector(Array(1.0, 2, 3, 4, 5)) + withClue("Size of vector must match the rank of matrix") { + intercept[Exception] { + spr(alpha, x, matrix33) + } + } + + // test sparse vector + val sv = new SparseVector(4, Array(0, 3), Array(1.0, 2)) + val U2 = new DenseVector(Array(1.0, 2, 2, 3, 3, 3, 4, 4, 4, 4)) + spr(0.1, sv, U2) + val expectedSparse = new DenseVector(Array(1.1, 2.0, 2.0, 3.0, 3.0, 3.0, 4.2, 4.0, 4.0, 4.4)) + assert(U2 ~== expectedSparse absTol 1e-15) + } + test("syr") { val dA = new DenseMatrix(4, 4, Array(0.0, 1.2, 2.2, 3.1, 1.2, 3.2, 5.3, 4.6, 2.2, 5.3, 1.8, 3.0, 3.1, 4.6, 3.0, 0.8)) From 8abef21dac1a6538c4e4e0140323b83d804d602b Mon Sep 17 00:00:00 2001 From: Marcelo Vanzin Date: Tue, 15 Sep 2015 10:45:02 -0700 Subject: [PATCH 293/802] [SPARK-10300] [BUILD] [TESTS] Add support for test tags in run-tests.py. This change does two things: - tag a few tests and adds the mechanism in the build to be able to disable those tags, both in maven and sbt, for both junit and scalatest suites. - add some logic to run-tests.py to disable some tags depending on what files have changed; that's used to disable expensive tests when a module hasn't explicitly been changed, to speed up testing for changes that don't directly affect those modules. Author: Marcelo Vanzin Closes #8437 from vanzin/test-tags. 
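The moving parts of this change are spread over several files; the following is a condensed, self-contained sketch of the flow, with names taken from the diff below (the real logic lives in `dev/run-tests.py`, `pom.xml` and `project/SparkBuild.scala`):

```
# Condensed sketch of the tag-exclusion flow; names mirror the diff below.
class Module(object):
    def __init__(self, name, test_tags=()):
        self.name = name
        self.test_tags = list(test_tags)

core = Module("core")
hive = Module("hive", ["org.apache.spark.sql.hive.ExtendedHiveTest"])
yarn = Module("yarn", ["org.apache.spark.deploy.yarn.ExtendedYarnTest"])
all_modules = [core, hive, yarn]

def determine_tags_to_exclude(changed_modules):
    # Tests carrying a module's tags run only when that module itself changed.
    tags = []
    for m in all_modules:
        if m not in changed_modules:
            tags += m.test_tags
    return tags

# Only "core" changed here, so the expensive Hive and YARN suites are excluded;
# the build turns this property into ScalaTest "-l <tag>" and JUnit
# "--exclude-categories=<tag>" arguments.
excluded_tags = determine_tags_to_exclude([core])
print("-Dtest.exclude.tags=" + ",".join(excluded_tags))
```
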
--- core/pom.xml | 10 ------- dev/run-tests.py | 19 ++++++++++++-- dev/sparktestsupport/modules.py | 24 ++++++++++++++++- external/flume/pom.xml | 10 ------- external/kafka/pom.xml | 10 ------- external/mqtt/pom.xml | 10 ------- external/twitter/pom.xml | 10 ------- external/zeromq/pom.xml | 10 ------- extras/java8-tests/pom.xml | 10 ------- extras/kinesis-asl/pom.xml | 5 ---- launcher/pom.xml | 5 ---- mllib/pom.xml | 10 ------- network/common/pom.xml | 10 ------- network/shuffle/pom.xml | 10 ------- pom.xml | 17 ++++++++++-- project/SparkBuild.scala | 13 ++++++++-- sql/core/pom.xml | 5 ---- .../execution/HiveCompatibilitySuite.scala | 2 ++ sql/hive/pom.xml | 5 ---- .../spark/sql/hive/ExtendedHiveTest.java | 26 +++++++++++++++++++ .../spark/sql/hive/client/VersionsSuite.scala | 2 ++ streaming/pom.xml | 10 ------- unsafe/pom.xml | 10 ------- .../spark/deploy/yarn/ExtendedYarnTest.java | 26 +++++++++++++++++++ .../spark/deploy/yarn/YarnClusterSuite.scala | 1 + .../yarn/YarnShuffleIntegrationSuite.scala | 1 + 26 files changed, 124 insertions(+), 147 deletions(-) create mode 100644 sql/hive/src/test/java/org/apache/spark/sql/hive/ExtendedHiveTest.java create mode 100644 yarn/src/test/java/org/apache/spark/deploy/yarn/ExtendedYarnTest.java diff --git a/core/pom.xml b/core/pom.xml index e31d90f608892..8a20181096223 100644 --- a/core/pom.xml +++ b/core/pom.xml @@ -331,16 +331,6 @@ scalacheck_${scala.binary.version} test - - junit - junit - test - - - com.novocode - junit-interface - test - org.apache.curator curator-test diff --git a/dev/run-tests.py b/dev/run-tests.py index d8b22e1665e7b..1a816585187d9 100755 --- a/dev/run-tests.py +++ b/dev/run-tests.py @@ -118,6 +118,14 @@ def determine_modules_to_test(changed_modules): return modules_to_test.union(set(changed_modules)) +def determine_tags_to_exclude(changed_modules): + tags = [] + for m in modules.all_modules: + if m not in changed_modules: + tags += m.test_tags + return tags + + # ------------------------------------------------------------------------------------------------- # Functions for working with subprocesses and shell tools # ------------------------------------------------------------------------------------------------- @@ -369,6 +377,7 @@ def detect_binary_inop_with_mima(): def run_scala_tests_maven(test_profiles): mvn_test_goals = ["test", "--fail-at-end"] + profiles_and_goals = test_profiles + mvn_test_goals print("[info] Running Spark tests using Maven with these arguments: ", @@ -392,7 +401,7 @@ def run_scala_tests_sbt(test_modules, test_profiles): exec_sbt(profiles_and_goals) -def run_scala_tests(build_tool, hadoop_version, test_modules): +def run_scala_tests(build_tool, hadoop_version, test_modules, excluded_tags): """Function to properly execute all tests passed in as a set from the `determine_test_suites` function""" set_title_and_block("Running Spark unit tests", "BLOCK_SPARK_UNIT_TESTS") @@ -401,6 +410,10 @@ def run_scala_tests(build_tool, hadoop_version, test_modules): test_profiles = get_hadoop_profiles(hadoop_version) + \ list(set(itertools.chain.from_iterable(m.build_profile_flags for m in test_modules))) + + if excluded_tags: + test_profiles += ['-Dtest.exclude.tags=' + ",".join(excluded_tags)] + if build_tool == "maven": run_scala_tests_maven(test_profiles) else: @@ -500,8 +513,10 @@ def main(): target_branch = os.environ["ghprbTargetBranch"] changed_files = identify_changed_files_from_git_commits("HEAD", target_branch=target_branch) changed_modules = determine_modules_for_files(changed_files) + excluded_tags = 
determine_tags_to_exclude(changed_modules) if not changed_modules: changed_modules = [modules.root] + excluded_tags = [] print("[info] Found the following changed modules:", ", ".join(x.name for x in changed_modules)) @@ -541,7 +556,7 @@ def main(): detect_binary_inop_with_mima() # run the test suites - run_scala_tests(build_tool, hadoop_version, test_modules) + run_scala_tests(build_tool, hadoop_version, test_modules, excluded_tags) modules_with_python_tests = [m for m in test_modules if m.python_test_goals] if modules_with_python_tests: diff --git a/dev/sparktestsupport/modules.py b/dev/sparktestsupport/modules.py index 346452f3174e4..65397f1f3e0bc 100644 --- a/dev/sparktestsupport/modules.py +++ b/dev/sparktestsupport/modules.py @@ -31,7 +31,7 @@ class Module(object): def __init__(self, name, dependencies, source_file_regexes, build_profile_flags=(), environ={}, sbt_test_goals=(), python_test_goals=(), blacklisted_python_implementations=(), - should_run_r_tests=False): + test_tags=(), should_run_r_tests=False): """ Define a new module. @@ -50,6 +50,8 @@ def __init__(self, name, dependencies, source_file_regexes, build_profile_flags= :param blacklisted_python_implementations: A set of Python implementations that are not supported by this module's Python components. The values in this set should match strings returned by Python's `platform.python_implementation()`. + :param test_tags A set of tags that will be excluded when running unit tests if the module + is not explicitly changed. :param should_run_r_tests: If true, changes in this module will trigger all R tests. """ self.name = name @@ -60,6 +62,7 @@ def __init__(self, name, dependencies, source_file_regexes, build_profile_flags= self.environ = environ self.python_test_goals = python_test_goals self.blacklisted_python_implementations = blacklisted_python_implementations + self.test_tags = test_tags self.should_run_r_tests = should_run_r_tests self.dependent_modules = set() @@ -85,6 +88,9 @@ def contains_file(self, filename): "catalyst/test", "sql/test", "hive/test", + ], + test_tags=[ + "org.apache.spark.sql.hive.ExtendedHiveTest" ] ) @@ -398,6 +404,22 @@ def contains_file(self, filename): ) +yarn = Module( + name="yarn", + dependencies=[], + source_file_regexes=[ + "yarn/", + "network/yarn/", + ], + sbt_test_goals=[ + "yarn/test", + "network-yarn/test", + ], + test_tags=[ + "org.apache.spark.deploy.yarn.ExtendedYarnTest" + ] +) + # The root module is a dummy module which is used to run all of the tests. # No other modules should directly depend on this module. 
root = Module( diff --git a/external/flume/pom.xml b/external/flume/pom.xml index 132062f94fb45..3154e36c21ef5 100644 --- a/external/flume/pom.xml +++ b/external/flume/pom.xml @@ -66,16 +66,6 @@ scalacheck_${scala.binary.version} test - - junit - junit - test - - - com.novocode - junit-interface - test - target/scala-${scala.binary.version}/classes diff --git a/external/kafka/pom.xml b/external/kafka/pom.xml index 05abd9e2e6810..7d0d46dadc727 100644 --- a/external/kafka/pom.xml +++ b/external/kafka/pom.xml @@ -86,16 +86,6 @@ scalacheck_${scala.binary.version} test - - junit - junit - test - - - com.novocode - junit-interface - test - target/scala-${scala.binary.version}/classes diff --git a/external/mqtt/pom.xml b/external/mqtt/pom.xml index 05e6338a08b0a..913c47d33f488 100644 --- a/external/mqtt/pom.xml +++ b/external/mqtt/pom.xml @@ -58,16 +58,6 @@ scalacheck_${scala.binary.version} test - - junit - junit - test - - - com.novocode - junit-interface - test - org.apache.activemq activemq-core diff --git a/external/twitter/pom.xml b/external/twitter/pom.xml index 244ad58ae9593..9137bf25ee8ae 100644 --- a/external/twitter/pom.xml +++ b/external/twitter/pom.xml @@ -58,16 +58,6 @@ scalacheck_${scala.binary.version} test - - junit - junit - test - - - com.novocode - junit-interface - test - target/scala-${scala.binary.version}/classes diff --git a/external/zeromq/pom.xml b/external/zeromq/pom.xml index 171df8682c848..6fec4f0e8a0f9 100644 --- a/external/zeromq/pom.xml +++ b/external/zeromq/pom.xml @@ -57,16 +57,6 @@ scalacheck_${scala.binary.version} test - - junit - junit - test - - - com.novocode - junit-interface - test - target/scala-${scala.binary.version}/classes diff --git a/extras/java8-tests/pom.xml b/extras/java8-tests/pom.xml index 81794a8536318..dba3dda8a9562 100644 --- a/extras/java8-tests/pom.xml +++ b/extras/java8-tests/pom.xml @@ -58,16 +58,6 @@ test-jar test - - junit - junit - test - - - com.novocode - junit-interface - test - diff --git a/extras/kinesis-asl/pom.xml b/extras/kinesis-asl/pom.xml index 6dd8ff69c2943..760f183a2ef37 100644 --- a/extras/kinesis-asl/pom.xml +++ b/extras/kinesis-asl/pom.xml @@ -74,11 +74,6 @@ scalacheck_${scala.binary.version} test - - com.novocode - junit-interface - test - target/scala-${scala.binary.version}/classes diff --git a/launcher/pom.xml b/launcher/pom.xml index ed38e66aa2467..80696280a1d18 100644 --- a/launcher/pom.xml +++ b/launcher/pom.xml @@ -42,11 +42,6 @@ log4j test - - junit - junit - test - org.mockito mockito-core diff --git a/mllib/pom.xml b/mllib/pom.xml index 22c0c6008ba37..5dedacb38874e 100644 --- a/mllib/pom.xml +++ b/mllib/pom.xml @@ -94,16 +94,6 @@ scalacheck_${scala.binary.version} test - - junit - junit - test - - - com.novocode - junit-interface - test - org.mockito mockito-core diff --git a/network/common/pom.xml b/network/common/pom.xml index 1cc054a8936c5..9c12cca0df609 100644 --- a/network/common/pom.xml +++ b/network/common/pom.xml @@ -64,16 +64,6 @@ - - junit - junit - test - - - com.novocode - junit-interface - test - log4j log4j diff --git a/network/shuffle/pom.xml b/network/shuffle/pom.xml index 7a66c968041ce..e4f4c57b683c8 100644 --- a/network/shuffle/pom.xml +++ b/network/shuffle/pom.xml @@ -78,16 +78,6 @@ test-jar test - - junit - junit - test - - - com.novocode - junit-interface - test - log4j log4j diff --git a/pom.xml b/pom.xml index 6535994641145..2927d3e107563 100644 --- a/pom.xml +++ b/pom.xml @@ -181,6 +181,7 @@ 0.9.2 ${java.home} + @@ -1952,6 +1964,7 @@ __not_used__ + ${test.exclude.tags} diff --git 
a/project/SparkBuild.scala b/project/SparkBuild.scala index 901cfa538d23e..d80d300f1c3b2 100644 --- a/project/SparkBuild.scala +++ b/project/SparkBuild.scala @@ -567,11 +567,20 @@ object TestSettings { javaOptions in Test ++= "-Xmx3g -Xss4096k -XX:PermSize=128M -XX:MaxNewSize=256m -XX:MaxPermSize=1g" .split(" ").toSeq, javaOptions += "-Xmx3g", + // Exclude tags defined in a system property + testOptions in Test += Tests.Argument(TestFrameworks.ScalaTest, + sys.props.get("test.exclude.tags").map { tags => + tags.split(",").flatMap { tag => Seq("-l", tag) }.toSeq + }.getOrElse(Nil): _*), + testOptions in Test += Tests.Argument(TestFrameworks.JUnit, + sys.props.get("test.exclude.tags").map { tags => + Seq("--exclude-categories=" + tags) + }.getOrElse(Nil): _*), // Show full stack trace and duration in test cases. testOptions in Test += Tests.Argument("-oDF"), - testOptions += Tests.Argument(TestFrameworks.JUnit, "-v", "-a"), + testOptions in Test += Tests.Argument(TestFrameworks.JUnit, "-v", "-a"), // Enable Junit testing. - libraryDependencies += "com.novocode" % "junit-interface" % "0.9" % "test", + libraryDependencies += "com.novocode" % "junit-interface" % "0.11" % "test", // Only allow one test at a time, even across projects, since they run in the same JVM parallelExecution in Test := false, // Make sure the test temp directory exists. diff --git a/sql/core/pom.xml b/sql/core/pom.xml index 465aa3a3888c2..fa6732db183d8 100644 --- a/sql/core/pom.xml +++ b/sql/core/pom.xml @@ -73,11 +73,6 @@ jackson-databind ${fasterxml.jackson.version} - - junit - junit - test - org.scalacheck scalacheck_${scala.binary.version} diff --git a/sql/hive/compatibility/src/test/scala/org/apache/spark/sql/hive/execution/HiveCompatibilitySuite.scala b/sql/hive/compatibility/src/test/scala/org/apache/spark/sql/hive/execution/HiveCompatibilitySuite.scala index ab309e0a1d36b..ffc4c32794ca4 100644 --- a/sql/hive/compatibility/src/test/scala/org/apache/spark/sql/hive/execution/HiveCompatibilitySuite.scala +++ b/sql/hive/compatibility/src/test/scala/org/apache/spark/sql/hive/execution/HiveCompatibilitySuite.scala @@ -24,11 +24,13 @@ import org.apache.spark.sql.catalyst.rules.RuleExecutor import org.scalatest.BeforeAndAfter import org.apache.spark.sql.SQLConf +import org.apache.spark.sql.hive.ExtendedHiveTest import org.apache.spark.sql.hive.test.TestHive /** * Runs the test cases that are included in the hive distribution. */ +@ExtendedHiveTest class HiveCompatibilitySuite extends HiveQueryFileTest with BeforeAndAfter { // TODO: bundle in jar files... get from classpath private lazy val hiveQueryDir = TestHive.getHiveFile( diff --git a/sql/hive/pom.xml b/sql/hive/pom.xml index ac67fe5f47be9..82cfeb2bb95d3 100644 --- a/sql/hive/pom.xml +++ b/sql/hive/pom.xml @@ -160,11 +160,6 @@ scalacheck_${scala.binary.version} test - - junit - junit - test - org.apache.spark spark-sql_${scala.binary.version} diff --git a/sql/hive/src/test/java/org/apache/spark/sql/hive/ExtendedHiveTest.java b/sql/hive/src/test/java/org/apache/spark/sql/hive/ExtendedHiveTest.java new file mode 100644 index 0000000000000..e2183183fb559 --- /dev/null +++ b/sql/hive/src/test/java/org/apache/spark/sql/hive/ExtendedHiveTest.java @@ -0,0 +1,26 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. 
+ * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.spark.sql.hive; + +import java.lang.annotation.*; +import org.scalatest.TagAnnotation; + +@TagAnnotation +@Retention(RetentionPolicy.RUNTIME) +@Target({ElementType.METHOD, ElementType.TYPE}) +public @interface ExtendedHiveTest { } diff --git a/sql/hive/src/test/scala/org/apache/spark/sql/hive/client/VersionsSuite.scala b/sql/hive/src/test/scala/org/apache/spark/sql/hive/client/VersionsSuite.scala index f0bb77092c0cf..888d1b7b45532 100644 --- a/sql/hive/src/test/scala/org/apache/spark/sql/hive/client/VersionsSuite.scala +++ b/sql/hive/src/test/scala/org/apache/spark/sql/hive/client/VersionsSuite.scala @@ -23,6 +23,7 @@ import org.apache.spark.sql.hive.HiveContext import org.apache.spark.{Logging, SparkFunSuite} import org.apache.spark.sql.catalyst.expressions.{NamedExpression, Literal, AttributeReference, EqualTo} import org.apache.spark.sql.catalyst.util.quietly +import org.apache.spark.sql.hive.ExtendedHiveTest import org.apache.spark.sql.types.IntegerType import org.apache.spark.util.Utils @@ -32,6 +33,7 @@ import org.apache.spark.util.Utils * sure that reflective calls are not throwing NoSuchMethod error, but the actually functionality * is not fully tested. */ +@ExtendedHiveTest class VersionsSuite extends SparkFunSuite with Logging { // Do not use a temp path here to speed up subsequent executions of the unit test during diff --git a/streaming/pom.xml b/streaming/pom.xml index 5cc9001b0e9ab..1e6ee009ca6d5 100644 --- a/streaming/pom.xml +++ b/streaming/pom.xml @@ -84,21 +84,11 @@ scalacheck_${scala.binary.version} test - - junit - junit - test - org.seleniumhq.selenium selenium-java test - - com.novocode - junit-interface - test - target/scala-${scala.binary.version}/classes diff --git a/unsafe/pom.xml b/unsafe/pom.xml index 066abe92e51c0..4e8b9a84bb67f 100644 --- a/unsafe/pom.xml +++ b/unsafe/pom.xml @@ -55,16 +55,6 @@ - - junit - junit - test - - - com.novocode - junit-interface - test - org.mockito mockito-core diff --git a/yarn/src/test/java/org/apache/spark/deploy/yarn/ExtendedYarnTest.java b/yarn/src/test/java/org/apache/spark/deploy/yarn/ExtendedYarnTest.java new file mode 100644 index 0000000000000..7a8f2fe979c1f --- /dev/null +++ b/yarn/src/test/java/org/apache/spark/deploy/yarn/ExtendedYarnTest.java @@ -0,0 +1,26 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.spark.deploy.yarn; + +import java.lang.annotation.*; +import org.scalatest.TagAnnotation; + +@TagAnnotation +@Retention(RetentionPolicy.RUNTIME) +@Target({ElementType.METHOD, ElementType.TYPE}) +public @interface ExtendedYarnTest { } diff --git a/yarn/src/test/scala/org/apache/spark/deploy/yarn/YarnClusterSuite.scala b/yarn/src/test/scala/org/apache/spark/deploy/yarn/YarnClusterSuite.scala index b5a42fd6afd98..105c3090d489d 100644 --- a/yarn/src/test/scala/org/apache/spark/deploy/yarn/YarnClusterSuite.scala +++ b/yarn/src/test/scala/org/apache/spark/deploy/yarn/YarnClusterSuite.scala @@ -39,6 +39,7 @@ import org.apache.spark.util.Utils * applications, and require the Spark assembly to be built before they can be successfully * run. */ +@ExtendedYarnTest class YarnClusterSuite extends BaseYarnClusterSuite { override def newYarnConfig(): YarnConfiguration = new YarnConfiguration() diff --git a/yarn/src/test/scala/org/apache/spark/deploy/yarn/YarnShuffleIntegrationSuite.scala b/yarn/src/test/scala/org/apache/spark/deploy/yarn/YarnShuffleIntegrationSuite.scala index 8d9c9b3004eda..4700e2428df08 100644 --- a/yarn/src/test/scala/org/apache/spark/deploy/yarn/YarnShuffleIntegrationSuite.scala +++ b/yarn/src/test/scala/org/apache/spark/deploy/yarn/YarnShuffleIntegrationSuite.scala @@ -32,6 +32,7 @@ import org.apache.spark.network.yarn.{YarnShuffleService, YarnTestAccessor} /** * Integration test for the external shuffle service with a yarn mini-cluster */ +@ExtendedYarnTest class YarnShuffleIntegrationSuite extends BaseYarnClusterSuite { override def newYarnConfig(): YarnConfiguration = { From 7ca30b505c3561dc2832b463be4c6301a90380e4 Mon Sep 17 00:00:00 2001 From: noelsmith Date: Tue, 15 Sep 2015 12:23:20 -0700 Subject: [PATCH 294/802] [PYSPARK] [MLLIB] [DOCS] Replaced addversion with versionadded in mllib.random Missed this when reviewing `pyspark.mllib.random` for SPARK-10275. Author: noelsmith Closes #8773 from noel-smith/mllib-random-versionadded-fix. --- python/pyspark/mllib/random.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/python/pyspark/mllib/random.py b/python/pyspark/mllib/random.py index 9c733b1332bc0..6a3c643b66417 100644 --- a/python/pyspark/mllib/random.py +++ b/python/pyspark/mllib/random.py @@ -41,7 +41,7 @@ class RandomRDDs(object): Generator methods for creating RDDs comprised of i.i.d samples from some distribution. - .. addedversion:: 1.1.0 + .. versionadded:: 1.1.0 """ @staticmethod From 0d9ab016755d5b56ce4043f229602169fd752e88 Mon Sep 17 00:00:00 2001 From: Xiangrui Meng Date: Tue, 15 Sep 2015 12:25:31 -0700 Subject: [PATCH 295/802] Closes #8738 Closes #8767 Closes #2491 Closes #6795 Closes #2096 Closes #7722 From 416003b26401894ec712e1a5291a92adfbc5af01 Mon Sep 17 00:00:00 2001 From: Jacek Laskowski Date: Tue, 15 Sep 2015 20:42:33 +0100 Subject: [PATCH 296/802] [DOCS] Small fixes to Spark on Yarn doc * a follow-up to 16b6d18613e150c7038c613992d80a7828413e66 as `--num-executors` flag is not suppported. * links + formatting Author: Jacek Laskowski Closes #8762 from jaceklaskowski/docs-spark-on-yarn. 
--- docs/running-on-yarn.md | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/docs/running-on-yarn.md b/docs/running-on-yarn.md index 5159ef9e3394e..d1244323edfff 100644 --- a/docs/running-on-yarn.md +++ b/docs/running-on-yarn.md @@ -18,16 +18,16 @@ Spark application's configuration (driver, executors, and the AM when running in There are two deploy modes that can be used to launch Spark applications on YARN. In `yarn-cluster` mode, the Spark driver runs inside an application master process which is managed by YARN on the cluster, and the client can go away after initiating the application. In `yarn-client` mode, the driver runs in the client process, and the application master is only used for requesting resources from YARN. -Unlike in Spark standalone and Mesos mode, in which the master's address is specified in the `--master` parameter, in YARN mode the ResourceManager's address is picked up from the Hadoop configuration. Thus, the `--master` parameter is `yarn-client` or `yarn-cluster`. +Unlike [Spark standalone](spark-standalone.html) and [Mesos](running-on-mesos.html) modes, in which the master's address is specified in the `--master` parameter, in YARN mode the ResourceManager's address is picked up from the Hadoop configuration. Thus, the `--master` parameter is `yarn-client` or `yarn-cluster`. + To launch a Spark application in `yarn-cluster` mode: - `$ ./bin/spark-submit --class path.to.your.Class --master yarn-cluster [options] [app options]` + $ ./bin/spark-submit --class path.to.your.Class --master yarn-cluster [options] [app options] For example: $ ./bin/spark-submit --class org.apache.spark.examples.SparkPi \ --master yarn-cluster \ - --num-executors 3 \ --driver-memory 4g \ --executor-memory 2g \ --executor-cores 1 \ @@ -37,7 +37,7 @@ For example: The above starts a YARN client program which starts the default Application Master. Then SparkPi will be run as a child thread of Application Master. The client will periodically poll the Application Master for status updates and display them in the console. The client will exit once your application has finished running. Refer to the "Debugging your Application" section below for how to see driver and executor logs. -To launch a Spark application in `yarn-client` mode, do the same, but replace `yarn-cluster` with `yarn-client`. To run spark-shell: +To launch a Spark application in `yarn-client` mode, do the same, but replace `yarn-cluster` with `yarn-client`. The following shows how you can run `spark-shell` in `yarn-client` mode: $ ./bin/spark-shell --master yarn-client @@ -54,8 +54,8 @@ In `yarn-cluster` mode, the driver runs on a different machine than the client, # Preparations -Running Spark-on-YARN requires a binary distribution of Spark which is built with YARN support. -Binary distributions can be downloaded from the Spark project website. +Running Spark on YARN requires a binary distribution of Spark which is built with YARN support. +Binary distributions can be downloaded from the [downloads page](http://spark.apache.org/downloads.html) of the project website. To build Spark yourself, refer to [Building Spark](building-spark.html). # Configuration From b42059d2efdf3322334694205a6d951bcc291644 Mon Sep 17 00:00:00 2001 From: Marcelo Vanzin Date: Tue, 15 Sep 2015 13:03:38 -0700 Subject: [PATCH 297/802] Revert "[SPARK-10300] [BUILD] [TESTS] Add support for test tags in run-tests.py." This reverts commit 8abef21dac1a6538c4e4e0140323b83d804d602b. 
--- core/pom.xml | 10 +++++++ dev/run-tests.py | 19 ++------------ dev/sparktestsupport/modules.py | 24 +---------------- external/flume/pom.xml | 10 +++++++ external/kafka/pom.xml | 10 +++++++ external/mqtt/pom.xml | 10 +++++++ external/twitter/pom.xml | 10 +++++++ external/zeromq/pom.xml | 10 +++++++ extras/java8-tests/pom.xml | 10 +++++++ extras/kinesis-asl/pom.xml | 5 ++++ launcher/pom.xml | 5 ++++ mllib/pom.xml | 10 +++++++ network/common/pom.xml | 10 +++++++ network/shuffle/pom.xml | 10 +++++++ pom.xml | 17 ++---------- project/SparkBuild.scala | 13 ++-------- sql/core/pom.xml | 5 ++++ .../execution/HiveCompatibilitySuite.scala | 2 -- sql/hive/pom.xml | 5 ++++ .../spark/sql/hive/ExtendedHiveTest.java | 26 ------------------- .../spark/sql/hive/client/VersionsSuite.scala | 2 -- streaming/pom.xml | 10 +++++++ unsafe/pom.xml | 10 +++++++ .../spark/deploy/yarn/ExtendedYarnTest.java | 26 ------------------- .../spark/deploy/yarn/YarnClusterSuite.scala | 1 - .../yarn/YarnShuffleIntegrationSuite.scala | 1 - 26 files changed, 147 insertions(+), 124 deletions(-) delete mode 100644 sql/hive/src/test/java/org/apache/spark/sql/hive/ExtendedHiveTest.java delete mode 100644 yarn/src/test/java/org/apache/spark/deploy/yarn/ExtendedYarnTest.java diff --git a/core/pom.xml b/core/pom.xml index 8a20181096223..e31d90f608892 100644 --- a/core/pom.xml +++ b/core/pom.xml @@ -331,6 +331,16 @@ scalacheck_${scala.binary.version} test + + junit + junit + test + + + com.novocode + junit-interface + test + org.apache.curator curator-test diff --git a/dev/run-tests.py b/dev/run-tests.py index 1a816585187d9..d8b22e1665e7b 100755 --- a/dev/run-tests.py +++ b/dev/run-tests.py @@ -118,14 +118,6 @@ def determine_modules_to_test(changed_modules): return modules_to_test.union(set(changed_modules)) -def determine_tags_to_exclude(changed_modules): - tags = [] - for m in modules.all_modules: - if m not in changed_modules: - tags += m.test_tags - return tags - - # ------------------------------------------------------------------------------------------------- # Functions for working with subprocesses and shell tools # ------------------------------------------------------------------------------------------------- @@ -377,7 +369,6 @@ def detect_binary_inop_with_mima(): def run_scala_tests_maven(test_profiles): mvn_test_goals = ["test", "--fail-at-end"] - profiles_and_goals = test_profiles + mvn_test_goals print("[info] Running Spark tests using Maven with these arguments: ", @@ -401,7 +392,7 @@ def run_scala_tests_sbt(test_modules, test_profiles): exec_sbt(profiles_and_goals) -def run_scala_tests(build_tool, hadoop_version, test_modules, excluded_tags): +def run_scala_tests(build_tool, hadoop_version, test_modules): """Function to properly execute all tests passed in as a set from the `determine_test_suites` function""" set_title_and_block("Running Spark unit tests", "BLOCK_SPARK_UNIT_TESTS") @@ -410,10 +401,6 @@ def run_scala_tests(build_tool, hadoop_version, test_modules, excluded_tags): test_profiles = get_hadoop_profiles(hadoop_version) + \ list(set(itertools.chain.from_iterable(m.build_profile_flags for m in test_modules))) - - if excluded_tags: - test_profiles += ['-Dtest.exclude.tags=' + ",".join(excluded_tags)] - if build_tool == "maven": run_scala_tests_maven(test_profiles) else: @@ -513,10 +500,8 @@ def main(): target_branch = os.environ["ghprbTargetBranch"] changed_files = identify_changed_files_from_git_commits("HEAD", target_branch=target_branch) changed_modules = determine_modules_for_files(changed_files) - 
excluded_tags = determine_tags_to_exclude(changed_modules) if not changed_modules: changed_modules = [modules.root] - excluded_tags = [] print("[info] Found the following changed modules:", ", ".join(x.name for x in changed_modules)) @@ -556,7 +541,7 @@ def main(): detect_binary_inop_with_mima() # run the test suites - run_scala_tests(build_tool, hadoop_version, test_modules, excluded_tags) + run_scala_tests(build_tool, hadoop_version, test_modules) modules_with_python_tests = [m for m in test_modules if m.python_test_goals] if modules_with_python_tests: diff --git a/dev/sparktestsupport/modules.py b/dev/sparktestsupport/modules.py index 65397f1f3e0bc..346452f3174e4 100644 --- a/dev/sparktestsupport/modules.py +++ b/dev/sparktestsupport/modules.py @@ -31,7 +31,7 @@ class Module(object): def __init__(self, name, dependencies, source_file_regexes, build_profile_flags=(), environ={}, sbt_test_goals=(), python_test_goals=(), blacklisted_python_implementations=(), - test_tags=(), should_run_r_tests=False): + should_run_r_tests=False): """ Define a new module. @@ -50,8 +50,6 @@ def __init__(self, name, dependencies, source_file_regexes, build_profile_flags= :param blacklisted_python_implementations: A set of Python implementations that are not supported by this module's Python components. The values in this set should match strings returned by Python's `platform.python_implementation()`. - :param test_tags A set of tags that will be excluded when running unit tests if the module - is not explicitly changed. :param should_run_r_tests: If true, changes in this module will trigger all R tests. """ self.name = name @@ -62,7 +60,6 @@ def __init__(self, name, dependencies, source_file_regexes, build_profile_flags= self.environ = environ self.python_test_goals = python_test_goals self.blacklisted_python_implementations = blacklisted_python_implementations - self.test_tags = test_tags self.should_run_r_tests = should_run_r_tests self.dependent_modules = set() @@ -88,9 +85,6 @@ def contains_file(self, filename): "catalyst/test", "sql/test", "hive/test", - ], - test_tags=[ - "org.apache.spark.sql.hive.ExtendedHiveTest" ] ) @@ -404,22 +398,6 @@ def contains_file(self, filename): ) -yarn = Module( - name="yarn", - dependencies=[], - source_file_regexes=[ - "yarn/", - "network/yarn/", - ], - sbt_test_goals=[ - "yarn/test", - "network-yarn/test", - ], - test_tags=[ - "org.apache.spark.deploy.yarn.ExtendedYarnTest" - ] -) - # The root module is a dummy module which is used to run all of the tests. # No other modules should directly depend on this module. 
root = Module( diff --git a/external/flume/pom.xml b/external/flume/pom.xml index 3154e36c21ef5..132062f94fb45 100644 --- a/external/flume/pom.xml +++ b/external/flume/pom.xml @@ -66,6 +66,16 @@ scalacheck_${scala.binary.version} test + + junit + junit + test + + + com.novocode + junit-interface + test + target/scala-${scala.binary.version}/classes diff --git a/external/kafka/pom.xml b/external/kafka/pom.xml index 7d0d46dadc727..05abd9e2e6810 100644 --- a/external/kafka/pom.xml +++ b/external/kafka/pom.xml @@ -86,6 +86,16 @@ scalacheck_${scala.binary.version} test + + junit + junit + test + + + com.novocode + junit-interface + test + target/scala-${scala.binary.version}/classes diff --git a/external/mqtt/pom.xml b/external/mqtt/pom.xml index 913c47d33f488..05e6338a08b0a 100644 --- a/external/mqtt/pom.xml +++ b/external/mqtt/pom.xml @@ -58,6 +58,16 @@ scalacheck_${scala.binary.version} test + + junit + junit + test + + + com.novocode + junit-interface + test + org.apache.activemq activemq-core diff --git a/external/twitter/pom.xml b/external/twitter/pom.xml index 9137bf25ee8ae..244ad58ae9593 100644 --- a/external/twitter/pom.xml +++ b/external/twitter/pom.xml @@ -58,6 +58,16 @@ scalacheck_${scala.binary.version} test + + junit + junit + test + + + com.novocode + junit-interface + test + target/scala-${scala.binary.version}/classes diff --git a/external/zeromq/pom.xml b/external/zeromq/pom.xml index 6fec4f0e8a0f9..171df8682c848 100644 --- a/external/zeromq/pom.xml +++ b/external/zeromq/pom.xml @@ -57,6 +57,16 @@ scalacheck_${scala.binary.version} test + + junit + junit + test + + + com.novocode + junit-interface + test + target/scala-${scala.binary.version}/classes diff --git a/extras/java8-tests/pom.xml b/extras/java8-tests/pom.xml index dba3dda8a9562..81794a8536318 100644 --- a/extras/java8-tests/pom.xml +++ b/extras/java8-tests/pom.xml @@ -58,6 +58,16 @@ test-jar test + + junit + junit + test + + + com.novocode + junit-interface + test + diff --git a/extras/kinesis-asl/pom.xml b/extras/kinesis-asl/pom.xml index 760f183a2ef37..6dd8ff69c2943 100644 --- a/extras/kinesis-asl/pom.xml +++ b/extras/kinesis-asl/pom.xml @@ -74,6 +74,11 @@ scalacheck_${scala.binary.version} test + + com.novocode + junit-interface + test + target/scala-${scala.binary.version}/classes diff --git a/launcher/pom.xml b/launcher/pom.xml index 80696280a1d18..ed38e66aa2467 100644 --- a/launcher/pom.xml +++ b/launcher/pom.xml @@ -42,6 +42,11 @@ log4j test + + junit + junit + test + org.mockito mockito-core diff --git a/mllib/pom.xml b/mllib/pom.xml index 5dedacb38874e..22c0c6008ba37 100644 --- a/mllib/pom.xml +++ b/mllib/pom.xml @@ -94,6 +94,16 @@ scalacheck_${scala.binary.version} test + + junit + junit + test + + + com.novocode + junit-interface + test + org.mockito mockito-core diff --git a/network/common/pom.xml b/network/common/pom.xml index 9c12cca0df609..1cc054a8936c5 100644 --- a/network/common/pom.xml +++ b/network/common/pom.xml @@ -64,6 +64,16 @@ + + junit + junit + test + + + com.novocode + junit-interface + test + log4j log4j diff --git a/network/shuffle/pom.xml b/network/shuffle/pom.xml index e4f4c57b683c8..7a66c968041ce 100644 --- a/network/shuffle/pom.xml +++ b/network/shuffle/pom.xml @@ -78,6 +78,16 @@ test-jar test + + junit + junit + test + + + com.novocode + junit-interface + test + log4j log4j diff --git a/pom.xml b/pom.xml index 2927d3e107563..6535994641145 100644 --- a/pom.xml +++ b/pom.xml @@ -181,7 +181,6 @@ 0.9.2 ${java.home} - @@ -1964,7 +1952,6 @@ __not_used__ - ${test.exclude.tags} diff --git 
a/project/SparkBuild.scala b/project/SparkBuild.scala index d80d300f1c3b2..901cfa538d23e 100644 --- a/project/SparkBuild.scala +++ b/project/SparkBuild.scala @@ -567,20 +567,11 @@ object TestSettings { javaOptions in Test ++= "-Xmx3g -Xss4096k -XX:PermSize=128M -XX:MaxNewSize=256m -XX:MaxPermSize=1g" .split(" ").toSeq, javaOptions += "-Xmx3g", - // Exclude tags defined in a system property - testOptions in Test += Tests.Argument(TestFrameworks.ScalaTest, - sys.props.get("test.exclude.tags").map { tags => - tags.split(",").flatMap { tag => Seq("-l", tag) }.toSeq - }.getOrElse(Nil): _*), - testOptions in Test += Tests.Argument(TestFrameworks.JUnit, - sys.props.get("test.exclude.tags").map { tags => - Seq("--exclude-categories=" + tags) - }.getOrElse(Nil): _*), // Show full stack trace and duration in test cases. testOptions in Test += Tests.Argument("-oDF"), - testOptions in Test += Tests.Argument(TestFrameworks.JUnit, "-v", "-a"), + testOptions += Tests.Argument(TestFrameworks.JUnit, "-v", "-a"), // Enable Junit testing. - libraryDependencies += "com.novocode" % "junit-interface" % "0.11" % "test", + libraryDependencies += "com.novocode" % "junit-interface" % "0.9" % "test", // Only allow one test at a time, even across projects, since they run in the same JVM parallelExecution in Test := false, // Make sure the test temp directory exists. diff --git a/sql/core/pom.xml b/sql/core/pom.xml index fa6732db183d8..465aa3a3888c2 100644 --- a/sql/core/pom.xml +++ b/sql/core/pom.xml @@ -73,6 +73,11 @@ jackson-databind ${fasterxml.jackson.version} + + junit + junit + test + org.scalacheck scalacheck_${scala.binary.version} diff --git a/sql/hive/compatibility/src/test/scala/org/apache/spark/sql/hive/execution/HiveCompatibilitySuite.scala b/sql/hive/compatibility/src/test/scala/org/apache/spark/sql/hive/execution/HiveCompatibilitySuite.scala index ffc4c32794ca4..ab309e0a1d36b 100644 --- a/sql/hive/compatibility/src/test/scala/org/apache/spark/sql/hive/execution/HiveCompatibilitySuite.scala +++ b/sql/hive/compatibility/src/test/scala/org/apache/spark/sql/hive/execution/HiveCompatibilitySuite.scala @@ -24,13 +24,11 @@ import org.apache.spark.sql.catalyst.rules.RuleExecutor import org.scalatest.BeforeAndAfter import org.apache.spark.sql.SQLConf -import org.apache.spark.sql.hive.ExtendedHiveTest import org.apache.spark.sql.hive.test.TestHive /** * Runs the test cases that are included in the hive distribution. */ -@ExtendedHiveTest class HiveCompatibilitySuite extends HiveQueryFileTest with BeforeAndAfter { // TODO: bundle in jar files... get from classpath private lazy val hiveQueryDir = TestHive.getHiveFile( diff --git a/sql/hive/pom.xml b/sql/hive/pom.xml index 82cfeb2bb95d3..ac67fe5f47be9 100644 --- a/sql/hive/pom.xml +++ b/sql/hive/pom.xml @@ -160,6 +160,11 @@ scalacheck_${scala.binary.version} test + + junit + junit + test + org.apache.spark spark-sql_${scala.binary.version} diff --git a/sql/hive/src/test/java/org/apache/spark/sql/hive/ExtendedHiveTest.java b/sql/hive/src/test/java/org/apache/spark/sql/hive/ExtendedHiveTest.java deleted file mode 100644 index e2183183fb559..0000000000000 --- a/sql/hive/src/test/java/org/apache/spark/sql/hive/ExtendedHiveTest.java +++ /dev/null @@ -1,26 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. 
- * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -package org.apache.spark.sql.hive; - -import java.lang.annotation.*; -import org.scalatest.TagAnnotation; - -@TagAnnotation -@Retention(RetentionPolicy.RUNTIME) -@Target({ElementType.METHOD, ElementType.TYPE}) -public @interface ExtendedHiveTest { } diff --git a/sql/hive/src/test/scala/org/apache/spark/sql/hive/client/VersionsSuite.scala b/sql/hive/src/test/scala/org/apache/spark/sql/hive/client/VersionsSuite.scala index 888d1b7b45532..f0bb77092c0cf 100644 --- a/sql/hive/src/test/scala/org/apache/spark/sql/hive/client/VersionsSuite.scala +++ b/sql/hive/src/test/scala/org/apache/spark/sql/hive/client/VersionsSuite.scala @@ -23,7 +23,6 @@ import org.apache.spark.sql.hive.HiveContext import org.apache.spark.{Logging, SparkFunSuite} import org.apache.spark.sql.catalyst.expressions.{NamedExpression, Literal, AttributeReference, EqualTo} import org.apache.spark.sql.catalyst.util.quietly -import org.apache.spark.sql.hive.ExtendedHiveTest import org.apache.spark.sql.types.IntegerType import org.apache.spark.util.Utils @@ -33,7 +32,6 @@ import org.apache.spark.util.Utils * sure that reflective calls are not throwing NoSuchMethod error, but the actually functionality * is not fully tested. */ -@ExtendedHiveTest class VersionsSuite extends SparkFunSuite with Logging { // Do not use a temp path here to speed up subsequent executions of the unit test during diff --git a/streaming/pom.xml b/streaming/pom.xml index 1e6ee009ca6d5..5cc9001b0e9ab 100644 --- a/streaming/pom.xml +++ b/streaming/pom.xml @@ -84,11 +84,21 @@ scalacheck_${scala.binary.version} test + + junit + junit + test + org.seleniumhq.selenium selenium-java test + + com.novocode + junit-interface + test + target/scala-${scala.binary.version}/classes diff --git a/unsafe/pom.xml b/unsafe/pom.xml index 4e8b9a84bb67f..066abe92e51c0 100644 --- a/unsafe/pom.xml +++ b/unsafe/pom.xml @@ -55,6 +55,16 @@ + + junit + junit + test + + + com.novocode + junit-interface + test + org.mockito mockito-core diff --git a/yarn/src/test/java/org/apache/spark/deploy/yarn/ExtendedYarnTest.java b/yarn/src/test/java/org/apache/spark/deploy/yarn/ExtendedYarnTest.java deleted file mode 100644 index 7a8f2fe979c1f..0000000000000 --- a/yarn/src/test/java/org/apache/spark/deploy/yarn/ExtendedYarnTest.java +++ /dev/null @@ -1,26 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. 
You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -package org.apache.spark.deploy.yarn; - -import java.lang.annotation.*; -import org.scalatest.TagAnnotation; - -@TagAnnotation -@Retention(RetentionPolicy.RUNTIME) -@Target({ElementType.METHOD, ElementType.TYPE}) -public @interface ExtendedYarnTest { } diff --git a/yarn/src/test/scala/org/apache/spark/deploy/yarn/YarnClusterSuite.scala b/yarn/src/test/scala/org/apache/spark/deploy/yarn/YarnClusterSuite.scala index 105c3090d489d..b5a42fd6afd98 100644 --- a/yarn/src/test/scala/org/apache/spark/deploy/yarn/YarnClusterSuite.scala +++ b/yarn/src/test/scala/org/apache/spark/deploy/yarn/YarnClusterSuite.scala @@ -39,7 +39,6 @@ import org.apache.spark.util.Utils * applications, and require the Spark assembly to be built before they can be successfully * run. */ -@ExtendedYarnTest class YarnClusterSuite extends BaseYarnClusterSuite { override def newYarnConfig(): YarnConfiguration = new YarnConfiguration() diff --git a/yarn/src/test/scala/org/apache/spark/deploy/yarn/YarnShuffleIntegrationSuite.scala b/yarn/src/test/scala/org/apache/spark/deploy/yarn/YarnShuffleIntegrationSuite.scala index 4700e2428df08..8d9c9b3004eda 100644 --- a/yarn/src/test/scala/org/apache/spark/deploy/yarn/YarnShuffleIntegrationSuite.scala +++ b/yarn/src/test/scala/org/apache/spark/deploy/yarn/YarnShuffleIntegrationSuite.scala @@ -32,7 +32,6 @@ import org.apache.spark.network.yarn.{YarnShuffleService, YarnTestAccessor} /** * Integration test for the external shuffle service with a yarn mini-cluster */ -@ExtendedYarnTest class YarnShuffleIntegrationSuite extends BaseYarnClusterSuite { override def newYarnConfig(): YarnConfiguration = { From 841972e22c653ba58e9a65433fed203ff288f13a Mon Sep 17 00:00:00 2001 From: Liang-Chi Hsieh Date: Tue, 15 Sep 2015 13:33:32 -0700 Subject: [PATCH 298/802] [SPARK-10437] [SQL] Support aggregation expressions in Order By JIRA: https://issues.apache.org/jira/browse/SPARK-10437 If an expression in `SortOrder` is a resolved one, such as `count(1)`, the corresponding rule in `Analyzer` to make it work in order by will not be applied. Author: Liang-Chi Hsieh Closes #8599 from viirya/orderby-agg. --- .../sql/catalyst/analysis/Analyzer.scala | 14 +++++++++---- .../org/apache/spark/sql/SQLQuerySuite.scala | 20 +++++++++++++++++++ 2 files changed, 30 insertions(+), 4 deletions(-) diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/Analyzer.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/Analyzer.scala index 591747b45c376..02f34cbf58ad0 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/Analyzer.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/Analyzer.scala @@ -561,7 +561,7 @@ class Analyzer( } case sort @ Sort(sortOrder, global, aggregate: Aggregate) - if aggregate.resolved && !sort.resolved => + if aggregate.resolved => // Try resolving the ordering as though it is in the aggregate clause. 
try { @@ -598,9 +598,15 @@ class Analyzer( } } - Project(aggregate.output, - Sort(evaluatedOrderings, global, - aggregate.copy(aggregateExpressions = originalAggExprs ++ needsPushDown))) + // Since we don't rely on sort.resolved as the stop condition for this rule, + // we need to check this and prevent applying this rule multiple times + if (sortOrder == evaluatedOrderings) { + sort + } else { + Project(aggregate.output, + Sort(evaluatedOrderings, global, + aggregate.copy(aggregateExpressions = originalAggExprs ++ needsPushDown))) + } } catch { // Attempting to resolve in the aggregate can result in ambiguity. When this happens, // just return the original plan. diff --git a/sql/core/src/test/scala/org/apache/spark/sql/SQLQuerySuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/SQLQuerySuite.scala index 962b100b532c9..f9981356f364f 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/SQLQuerySuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/SQLQuerySuite.scala @@ -1562,6 +1562,26 @@ class SQLQuerySuite extends QueryTest with SharedSQLContext { |ORDER BY sum(b) + 1 """.stripMargin), Row("4", 3) :: Row("1", 7) :: Row("3", 11) :: Row("2", 15) :: Nil) + + checkAnswer( + sql( + """ + |SELECT count(*) + |FROM orderByData + |GROUP BY a + |ORDER BY count(*) + """.stripMargin), + Row(2) :: Row(2) :: Row(2) :: Row(2) :: Nil) + + checkAnswer( + sql( + """ + |SELECT a + |FROM orderByData + |GROUP BY a + |ORDER BY a, count(*), sum(b) + """.stripMargin), + Row("1") :: Row("2") :: Row("3") :: Row("4") :: Nil) } test("SPARK-7952: fix the equality check between boolean and numeric types") { From 31a229aa739b6d05ec6d91b820fcca79b6b7d6fe Mon Sep 17 00:00:00 2001 From: Wenchen Fan Date: Tue, 15 Sep 2015 13:36:52 -0700 Subject: [PATCH 299/802] [SPARK-10475] [SQL] improve column prunning for Project on Sort Sometimes we can't push down the whole `Project` though `Sort`, but we still have a chance to push down part of it. Author: Wenchen Fan Closes #8644 from cloud-fan/column-prune. --- .../sql/catalyst/optimizer/Optimizer.scala | 19 +++++++++++++++---- .../optimizer/ColumnPruningSuite.scala | 11 +++++++++++ 2 files changed, 26 insertions(+), 4 deletions(-) diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/optimizer/Optimizer.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/optimizer/Optimizer.scala index 0f4caec7451a2..648a65e7c0eb3 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/optimizer/Optimizer.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/optimizer/Optimizer.scala @@ -228,10 +228,21 @@ object ColumnPruning extends Rule[LogicalPlan] { case Project(projectList, Limit(exp, child)) => Limit(exp, Project(projectList, child)) - // Push down project if possible when the child is sort - case p @ Project(projectList, s @ Sort(_, _, grandChild)) - if s.references.subsetOf(p.outputSet) => - s.copy(child = Project(projectList, grandChild)) + // Push down project if possible when the child is sort. + case p @ Project(projectList, s @ Sort(_, _, grandChild)) => + if (s.references.subsetOf(p.outputSet)) { + s.copy(child = Project(projectList, grandChild)) + } else { + val neededReferences = s.references ++ p.references + if (neededReferences == grandChild.outputSet) { + // No column we can prune, return the original plan. + p + } else { + // Do not use neededReferences.toSeq directly, should respect grandChild's output order. 
+ val newProjectList = grandChild.output.filter(neededReferences.contains) + p.copy(child = s.copy(child = Project(newProjectList, grandChild))) + } + } // Eliminate no-op Projects case Project(projectList, child) if child.output == projectList => child diff --git a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/optimizer/ColumnPruningSuite.scala b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/optimizer/ColumnPruningSuite.scala index dbebcb86809de..4a1e7ceaf394b 100644 --- a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/optimizer/ColumnPruningSuite.scala +++ b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/optimizer/ColumnPruningSuite.scala @@ -80,5 +80,16 @@ class ColumnPruningSuite extends PlanTest { comparePlans(optimized, correctAnswer) } + test("Column pruning for Project on Sort") { + val input = LocalRelation('a.int, 'b.string, 'c.double) + + val query = input.orderBy('b.asc).select('a).analyze + val optimized = Optimize.execute(query) + + val correctAnswer = input.select('a, 'b).orderBy('b.asc).select('a).analyze + + comparePlans(optimized, correctAnswer) + } + // todo: add more tests for column pruning } From be52faa7c72fb4b95829f09a7dc5eb5dccd03524 Mon Sep 17 00:00:00 2001 From: DB Tsai Date: Tue, 15 Sep 2015 15:46:47 -0700 Subject: [PATCH 300/802] [SPARK-7685] [ML] Apply weights to different samples in Logistic Regression In fraud detection dataset, almost all the samples are negative while only couple of them are positive. This type of high imbalanced data will bias the models toward negative resulting poor performance. In python-scikit, they provide a correction allowing users to Over-/undersample the samples of each class according to the given weights. In auto mode, selects weights inversely proportional to class frequencies in the training set. This can be done in a more efficient way by multiplying the weights into loss and gradient instead of doing actual over/undersampling in the training dataset which is very expensive. http://scikit-learn.org/stable/modules/generated/sklearn.linear_model.LogisticRegression.html On the other hand, some of the training data maybe more important like the training samples from tenure users while the training samples from new users maybe less important. We should be able to provide another "weight: Double" information in the LabeledPoint to weight them differently in the learning algorithm. Author: DB Tsai Author: DB Tsai Closes #7884 from dbtsai/SPARK-7685. 
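As context for the diff that follows: the change folds each instance's weight into the binary logistic loss and gradient and normalizes by the sum of weights instead of the instance count. Below is a rough, self-contained sketch of that objective, not the patch's actual code; `Instance` mirrors the case class introduced by the patch but uses a plain `Array[Double]`, `weightedLossAndGradient` is an illustrative name, and the intercept, feature standardization, and the numerically stable `MLUtils.log1pExp` used by the real `LogisticAggregator` are left out.

// Sketch, not the patch's API: weighted binary logistic loss and gradient.
// Each example contributes weight * loss and weight * gradient, and the totals
// are divided by the sum of weights, so duplicating an instance is equivalent
// to doubling its weight.
case class Instance(label: Double, weight: Double, features: Array[Double])

def weightedLossAndGradient(
    coefficients: Array[Double],
    data: Seq[Instance]): (Double, Array[Double]) = {
  val gradient = Array.ofDim[Double](coefficients.length)
  var lossSum = 0.0
  var weightSum = 0.0
  data.foreach { case Instance(label, weight, features) =>
    val dot = features.zip(coefficients).map { case (x, b) => x * b }.sum
    val margin = -dot
    // Gradient scale for this instance, matching the patch's `multiplier`.
    val multiplier = weight * (1.0 / (1.0 + math.exp(margin)) - label)
    var i = 0
    while (i < coefficients.length) {
      gradient(i) += multiplier * features(i)
      i += 1
    }
    // log(1 + exp(margin)) for label 1, log(1 + exp(margin)) - margin for label 0.
    lossSum += weight * (math.log1p(math.exp(margin)) - (1.0 - label) * margin)
    weightSum += weight
  }
  (lossSum / weightSum, gradient.map(_ / weightSum))
}

This is also why the diff extends `MultiClassSummarizer` and `MultivariateOnlineSummarizer` with weight-aware add and merge: the label histogram and the feature mean/variance have to be accumulated in terms of weight sums (and, for the unbiased variance, the sum of squared weights) rather than raw counts.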
--- .../classification/LogisticRegression.scala | 199 +++++++++++------- .../ml/param/shared/SharedParamsCodeGen.scala | 6 +- .../spark/ml/param/shared/sharedParams.scala | 12 +- .../stat/MultivariateOnlineSummarizer.scala | 75 ++++--- .../LogisticRegressionSuite.scala | 102 ++++++++- .../MultivariateOnlineSummarizerSuite.scala | 27 +++ project/MimaExcludes.scala | 10 +- 7 files changed, 303 insertions(+), 128 deletions(-) diff --git a/mllib/src/main/scala/org/apache/spark/ml/classification/LogisticRegression.scala b/mllib/src/main/scala/org/apache/spark/ml/classification/LogisticRegression.scala index a460262b87e43..bd96e8d000ff2 100644 --- a/mllib/src/main/scala/org/apache/spark/ml/classification/LogisticRegression.scala +++ b/mllib/src/main/scala/org/apache/spark/ml/classification/LogisticRegression.scala @@ -29,12 +29,12 @@ import org.apache.spark.ml.param.shared._ import org.apache.spark.ml.util.Identifiable import org.apache.spark.mllib.linalg._ import org.apache.spark.mllib.linalg.BLAS._ -import org.apache.spark.mllib.regression.LabeledPoint import org.apache.spark.mllib.evaluation.BinaryClassificationMetrics import org.apache.spark.mllib.stat.MultivariateOnlineSummarizer import org.apache.spark.mllib.util.MLUtils import org.apache.spark.rdd.RDD import org.apache.spark.sql.{DataFrame, Row} +import org.apache.spark.sql.functions.{col, lit} import org.apache.spark.storage.StorageLevel /** @@ -42,7 +42,7 @@ import org.apache.spark.storage.StorageLevel */ private[classification] trait LogisticRegressionParams extends ProbabilisticClassifierParams with HasRegParam with HasElasticNetParam with HasMaxIter with HasFitIntercept with HasTol - with HasStandardization with HasThreshold { + with HasStandardization with HasWeightCol with HasThreshold { /** * Set threshold in binary classification, in range [0, 1]. @@ -146,6 +146,17 @@ private[classification] trait LogisticRegressionParams extends ProbabilisticClas } } +/** + * Class that represents an instance of weighted data point with label and features. + * + * TODO: Refactor this class to proper place. + * + * @param label Label for this data point. + * @param weight The weight of this instance. + * @param features The vector of features for this data point. + */ +private[classification] case class Instance(label: Double, weight: Double, features: Vector) + /** * :: Experimental :: * Logistic regression. @@ -218,31 +229,42 @@ class LogisticRegression(override val uid: String) override def getThreshold: Double = super.getThreshold + /** + * Whether to over-/under-sample training instances according to the given weights in weightCol. + * If empty, all instances are treated equally (weight 1.0). + * Default is empty, so all instances have weight one. + * @group setParam + */ + def setWeightCol(value: String): this.type = set(weightCol, value) + setDefault(weightCol -> "") + override def setThresholds(value: Array[Double]): this.type = super.setThresholds(value) override def getThresholds: Array[Double] = super.getThresholds override protected def train(dataset: DataFrame): LogisticRegressionModel = { // Extract columns from data. If dataset is persisted, do not persist oldDataset. 
- val instances = extractLabeledPoints(dataset).map { - case LabeledPoint(label: Double, features: Vector) => (label, features) + val w = if ($(weightCol).isEmpty) lit(1.0) else col($(weightCol)) + val instances: RDD[Instance] = dataset.select(col($(labelCol)), w, col($(featuresCol))).map { + case Row(label: Double, weight: Double, features: Vector) => + Instance(label, weight, features) } + val handlePersistence = dataset.rdd.getStorageLevel == StorageLevel.NONE if (handlePersistence) instances.persist(StorageLevel.MEMORY_AND_DISK) - val (summarizer, labelSummarizer) = instances.treeAggregate( - (new MultivariateOnlineSummarizer, new MultiClassSummarizer))( - seqOp = (c, v) => (c, v) match { - case ((summarizer: MultivariateOnlineSummarizer, labelSummarizer: MultiClassSummarizer), - (label: Double, features: Vector)) => - (summarizer.add(features), labelSummarizer.add(label)) - }, - combOp = (c1, c2) => (c1, c2) match { - case ((summarizer1: MultivariateOnlineSummarizer, - classSummarizer1: MultiClassSummarizer), (summarizer2: MultivariateOnlineSummarizer, - classSummarizer2: MultiClassSummarizer)) => - (summarizer1.merge(summarizer2), classSummarizer1.merge(classSummarizer2)) - }) + val (summarizer, labelSummarizer) = { + val seqOp = (c: (MultivariateOnlineSummarizer, MultiClassSummarizer), + instance: Instance) => + (c._1.add(instance.features, instance.weight), c._2.add(instance.label, instance.weight)) + + val combOp = (c1: (MultivariateOnlineSummarizer, MultiClassSummarizer), + c2: (MultivariateOnlineSummarizer, MultiClassSummarizer)) => + (c1._1.merge(c2._1), c1._2.merge(c2._2)) + + instances.treeAggregate( + new MultivariateOnlineSummarizer, new MultiClassSummarizer)(seqOp, combOp) + } val histogram = labelSummarizer.histogram val numInvalid = labelSummarizer.countInvalid @@ -295,7 +317,7 @@ class LogisticRegression(override val uid: String) new BreezeOWLQN[Int, BDV[Double]]($(maxIter), 10, regParamL1Fun, $(tol)) } - val initialWeightsWithIntercept = + val initialCoefficientsWithIntercept = Vectors.zeros(if ($(fitIntercept)) numFeatures + 1 else numFeatures) if ($(fitIntercept)) { @@ -312,14 +334,14 @@ class LogisticRegression(override val uid: String) b = \log{P(1) / P(0)} = \log{count_1 / count_0} }}} */ - initialWeightsWithIntercept.toArray(numFeatures) - = math.log(histogram(1).toDouble / histogram(0).toDouble) + initialCoefficientsWithIntercept.toArray(numFeatures) + = math.log(histogram(1) / histogram(0)) } val states = optimizer.iterations(new CachedDiffFunction(costFun), - initialWeightsWithIntercept.toBreeze.toDenseVector) + initialCoefficientsWithIntercept.toBreeze.toDenseVector) - val (weights, intercept, objectiveHistory) = { + val (coefficients, intercept, objectiveHistory) = { /* Note that in Logistic Regression, the objective history (loss + regularization) is log-likelihood which is invariance under feature standardization. As a result, @@ -339,28 +361,29 @@ class LogisticRegression(override val uid: String) } /* - The weights are trained in the scaled space; we're converting them back to + The coefficients are trained in the scaled space; we're converting them back to the original space. Note that the intercept in scaled space and original space is the same; as a result, no scaling is needed. 
*/ - val rawWeights = state.x.toArray.clone() + val rawCoefficients = state.x.toArray.clone() var i = 0 while (i < numFeatures) { - rawWeights(i) *= { if (featuresStd(i) != 0.0) 1.0 / featuresStd(i) else 0.0 } + rawCoefficients(i) *= { if (featuresStd(i) != 0.0) 1.0 / featuresStd(i) else 0.0 } i += 1 } if ($(fitIntercept)) { - (Vectors.dense(rawWeights.dropRight(1)).compressed, rawWeights.last, arrayBuilder.result()) + (Vectors.dense(rawCoefficients.dropRight(1)).compressed, rawCoefficients.last, + arrayBuilder.result()) } else { - (Vectors.dense(rawWeights).compressed, 0.0, arrayBuilder.result()) + (Vectors.dense(rawCoefficients).compressed, 0.0, arrayBuilder.result()) } } if (handlePersistence) instances.unpersist() - val model = copyValues(new LogisticRegressionModel(uid, weights, intercept)) + val model = copyValues(new LogisticRegressionModel(uid, coefficients, intercept)) val logRegSummary = new BinaryLogisticRegressionTrainingSummary( model.transform(dataset), $(probabilityCol), @@ -501,22 +524,29 @@ class LogisticRegressionModel private[ml] ( * corresponding joint dataset. */ private[classification] class MultiClassSummarizer extends Serializable { - private val distinctMap = new mutable.HashMap[Int, Long] + // The first element of value in distinctMap is the actually number of instances, + // and the second element of value is sum of the weights. + private val distinctMap = new mutable.HashMap[Int, (Long, Double)] private var totalInvalidCnt: Long = 0L /** * Add a new label into this MultilabelSummarizer, and update the distinct map. * @param label The label for this data point. + * @param weight The weight of this instances. * @return This MultilabelSummarizer */ - def add(label: Double): this.type = { + def add(label: Double, weight: Double = 1.0): this.type = { + require(weight >= 0.0, s"instance weight, ${weight} has to be >= 0.0") + + if (weight == 0.0) return this + if (label - label.toInt != 0.0 || label < 0) { totalInvalidCnt += 1 this } else { - val counts: Long = distinctMap.getOrElse(label.toInt, 0L) - distinctMap.put(label.toInt, counts + 1) + val (counts: Long, weightSum: Double) = distinctMap.getOrElse(label.toInt, (0L, 0.0)) + distinctMap.put(label.toInt, (counts + 1L, weightSum + weight)) this } } @@ -537,8 +567,8 @@ private[classification] class MultiClassSummarizer extends Serializable { } smallMap.distinctMap.foreach { case (key, value) => - val counts = largeMap.distinctMap.getOrElse(key, 0L) - largeMap.distinctMap.put(key, counts + value) + val (counts: Long, weightSum: Double) = largeMap.distinctMap.getOrElse(key, (0L, 0.0)) + largeMap.distinctMap.put(key, (counts + value._1, weightSum + value._2)) } largeMap.totalInvalidCnt += smallMap.totalInvalidCnt largeMap @@ -550,13 +580,13 @@ private[classification] class MultiClassSummarizer extends Serializable { /** @return The number of distinct labels in the input dataset. */ def numClasses: Int = distinctMap.keySet.max + 1 - /** @return The counts of each label in the input dataset. */ - def histogram: Array[Long] = { - val result = Array.ofDim[Long](numClasses) + /** @return The weightSum of each label in the input dataset. 
*/ + def histogram: Array[Double] = { + val result = Array.ofDim[Double](numClasses) var i = 0 val len = result.length while (i < len) { - result(i) = distinctMap.getOrElse(i, 0L) + result(i) = distinctMap.getOrElse(i, (0L, 0.0))._2 i += 1 } result @@ -565,6 +595,8 @@ private[classification] class MultiClassSummarizer extends Serializable { /** * Abstraction for multinomial Logistic Regression Training results. + * Currently, the training summary ignores the training weights except + * for the objective trace. */ sealed trait LogisticRegressionTrainingSummary extends LogisticRegressionSummary { @@ -584,10 +616,10 @@ sealed trait LogisticRegressionSummary extends Serializable { /** Dataframe outputted by the model's `transform` method. */ def predictions: DataFrame - /** Field in "predictions" which gives the calibrated probability of each sample as a vector. */ + /** Field in "predictions" which gives the calibrated probability of each instance as a vector. */ def probabilityCol: String - /** Field in "predictions" which gives the the true label of each sample. */ + /** Field in "predictions" which gives the the true label of each instance. */ def labelCol: String } @@ -597,8 +629,8 @@ sealed trait LogisticRegressionSummary extends Serializable { * Logistic regression training results. * @param predictions dataframe outputted by the model's `transform` method. * @param probabilityCol field in "predictions" which gives the calibrated probability of - * each sample as a vector. - * @param labelCol field in "predictions" which gives the true label of each sample. + * each instance as a vector. + * @param labelCol field in "predictions" which gives the true label of each instance. * @param objectiveHistory objective function (scaled loss + regularization) at each iteration. */ @Experimental @@ -617,8 +649,8 @@ class BinaryLogisticRegressionTrainingSummary private[classification] ( * Binary Logistic regression results for a given model. * @param predictions dataframe outputted by the model's `transform` method. * @param probabilityCol field in "predictions" which gives the calibrated probability of - * each sample. - * @param labelCol field in "predictions" which gives the true label of each sample. + * each instance. + * @param labelCol field in "predictions" which gives the true label of each instance. */ @Experimental class BinaryLogisticRegressionSummary private[classification] ( @@ -687,14 +719,14 @@ class BinaryLogisticRegressionSummary private[classification] ( /** * LogisticAggregator computes the gradient and loss for binary logistic loss function, as used - * in binary classification for samples in sparse or dense vector in a online fashion. + * in binary classification for instances in sparse or dense vector in a online fashion. * * Note that multinomial logistic loss is not supported yet! * * Two LogisticAggregator can be merged together to have a summary of loss and gradient of * the corresponding joint dataset. * - * @param weights The weights/coefficients corresponding to the features. + * @param coefficients The coefficients corresponding to the features. * @param numClasses the number of possible outcomes for k classes classification problem in * Multinomial Logistic Regression. * @param fitIntercept Whether to fit an intercept term. @@ -702,25 +734,25 @@ class BinaryLogisticRegressionSummary private[classification] ( * @param featuresMean The mean values of the features. 
*/ private class LogisticAggregator( - weights: Vector, + coefficients: Vector, numClasses: Int, fitIntercept: Boolean, featuresStd: Array[Double], featuresMean: Array[Double]) extends Serializable { - private var totalCnt: Long = 0L + private var weightSum = 0.0 private var lossSum = 0.0 - private val weightsArray = weights match { + private val coefficientsArray = coefficients match { case dv: DenseVector => dv.values case _ => throw new IllegalArgumentException( - s"weights only supports dense vector but got type ${weights.getClass}.") + s"coefficients only supports dense vector but got type ${coefficients.getClass}.") } - private val dim = if (fitIntercept) weightsArray.length - 1 else weightsArray.length + private val dim = if (fitIntercept) coefficientsArray.length - 1 else coefficientsArray.length - private val gradientSumArray = Array.ofDim[Double](weightsArray.length) + private val gradientSumArray = Array.ofDim[Double](coefficientsArray.length) /** * Add a new training data to this LogisticAggregator, and update the loss and gradient @@ -729,13 +761,17 @@ private class LogisticAggregator( * @param label The label for this data point. * @param data The features for one data point in dense/sparse vector format to be added * into this aggregator. + * @param weight The weight for over-/undersamples each of training instance. Default is one. * @return This LogisticAggregator object. */ - def add(label: Double, data: Vector): this.type = { - require(dim == data.size, s"Dimensions mismatch when adding new sample." + + def add(label: Double, data: Vector, weight: Double = 1.0): this.type = { + require(dim == data.size, s"Dimensions mismatch when adding new instance." + s" Expecting $dim but got ${data.size}.") + require(weight >= 0.0, s"instance weight, ${weight} has to be >= 0.0") - val localWeightsArray = weightsArray + if (weight == 0.0) return this + + val localCoefficientsArray = coefficientsArray val localGradientSumArray = gradientSumArray numClasses match { @@ -745,13 +781,13 @@ private class LogisticAggregator( var sum = 0.0 data.foreachActive { (index, value) => if (featuresStd(index) != 0.0 && value != 0.0) { - sum += localWeightsArray(index) * (value / featuresStd(index)) + sum += localCoefficientsArray(index) * (value / featuresStd(index)) } } - sum + { if (fitIntercept) localWeightsArray(dim) else 0.0 } + sum + { if (fitIntercept) localCoefficientsArray(dim) else 0.0 } } - val multiplier = (1.0 / (1.0 + math.exp(margin))) - label + val multiplier = weight * (1.0 / (1.0 + math.exp(margin)) - label) data.foreachActive { (index, value) => if (featuresStd(index) != 0.0 && value != 0.0) { @@ -765,15 +801,15 @@ private class LogisticAggregator( if (label > 0) { // The following is equivalent to log(1 + exp(margin)) but more numerically stable. - lossSum += MLUtils.log1pExp(margin) + lossSum += weight * MLUtils.log1pExp(margin) } else { - lossSum += MLUtils.log1pExp(margin) - margin + lossSum += weight * (MLUtils.log1pExp(margin) - margin) } case _ => new NotImplementedError("LogisticRegression with ElasticNet in ML package only supports " + "binary classification for now.") } - totalCnt += 1 + weightSum += weight this } @@ -789,8 +825,8 @@ private class LogisticAggregator( require(dim == other.dim, s"Dimensions mismatch when merging with another " + s"LeastSquaresAggregator. 
Expecting $dim but got ${other.dim}.") - if (other.totalCnt != 0) { - totalCnt += other.totalCnt + if (other.weightSum != 0.0) { + weightSum += other.weightSum lossSum += other.lossSum var i = 0 @@ -805,13 +841,17 @@ private class LogisticAggregator( this } - def count: Long = totalCnt - - def loss: Double = lossSum / totalCnt + def loss: Double = { + require(weightSum > 0.0, s"The effective number of instances should be " + + s"greater than 0.0, but $weightSum.") + lossSum / weightSum + } def gradient: Vector = { + require(weightSum > 0.0, s"The effective number of instances should be " + + s"greater than 0.0, but $weightSum.") val result = Vectors.dense(gradientSumArray.clone()) - scal(1.0 / totalCnt, result) + scal(1.0 / weightSum, result) result } } @@ -823,7 +863,7 @@ private class LogisticAggregator( * It's used in Breeze's convex optimization routines. */ private class LogisticCostFun( - data: RDD[(Double, Vector)], + data: RDD[Instance], numClasses: Int, fitIntercept: Boolean, standardization: Boolean, @@ -831,22 +871,23 @@ private class LogisticCostFun( featuresMean: Array[Double], regParamL2: Double) extends DiffFunction[BDV[Double]] { - override def calculate(weights: BDV[Double]): (Double, BDV[Double]) = { + override def calculate(coefficients: BDV[Double]): (Double, BDV[Double]) = { val numFeatures = featuresStd.length - val w = Vectors.fromBreeze(weights) + val w = Vectors.fromBreeze(coefficients) - val logisticAggregator = data.treeAggregate(new LogisticAggregator(w, numClasses, fitIntercept, - featuresStd, featuresMean))( - seqOp = (c, v) => (c, v) match { - case (aggregator, (label, features)) => aggregator.add(label, features) - }, - combOp = (c1, c2) => (c1, c2) match { - case (aggregator1, aggregator2) => aggregator1.merge(aggregator2) - }) + val logisticAggregator = { + val seqOp = (c: LogisticAggregator, instance: Instance) => + c.add(instance.label, instance.features, instance.weight) + val combOp = (c1: LogisticAggregator, c2: LogisticAggregator) => c1.merge(c2) + + data.treeAggregate( + new LogisticAggregator(w, numClasses, fitIntercept, featuresStd, featuresMean) + )(seqOp, combOp) + } val totalGradientArray = logisticAggregator.gradient.toArray - // regVal is the sum of weight squares excluding intercept for L2 regularization. + // regVal is the sum of coefficients squares excluding intercept for L2 regularization. val regVal = if (regParamL2 == 0.0) { 0.0 } else { diff --git a/mllib/src/main/scala/org/apache/spark/ml/param/shared/SharedParamsCodeGen.scala b/mllib/src/main/scala/org/apache/spark/ml/param/shared/SharedParamsCodeGen.scala index e9e99ed1db40e..8049d51fee5ea 100644 --- a/mllib/src/main/scala/org/apache/spark/ml/param/shared/SharedParamsCodeGen.scala +++ b/mllib/src/main/scala/org/apache/spark/ml/param/shared/SharedParamsCodeGen.scala @@ -42,7 +42,7 @@ private[shared] object SharedParamsCodeGen { Some("\"rawPrediction\"")), ParamDesc[String]("probabilityCol", "Column name for predicted class conditional" + " probabilities. Note: Not all models output well-calibrated probability estimates!" 
+ - " These probabilities should be treated as confidences, not precise probabilities.", + " These probabilities should be treated as confidences, not precise probabilities", Some("\"probability\"")), ParamDesc[Double]("threshold", "threshold in binary classification prediction, in range [0, 1]", Some("0.5"), @@ -65,10 +65,10 @@ private[shared] object SharedParamsCodeGen { "options may be added later.", isValid = "ParamValidators.inArray(Array(\"skip\", \"error\"))"), ParamDesc[Boolean]("standardization", "whether to standardize the training features" + - " before fitting the model.", Some("true")), + " before fitting the model", Some("true")), ParamDesc[Long]("seed", "random seed", Some("this.getClass.getName.hashCode.toLong")), ParamDesc[Double]("elasticNetParam", "the ElasticNet mixing parameter, in range [0, 1]." + - " For alpha = 0, the penalty is an L2 penalty. For alpha = 1, it is an L1 penalty.", + " For alpha = 0, the penalty is an L2 penalty. For alpha = 1, it is an L1 penalty", isValid = "ParamValidators.inRange(0, 1)"), ParamDesc[Double]("tol", "the convergence tolerance for iterative algorithms"), ParamDesc[Double]("stepSize", "Step size to be used for each iteration of optimization."), diff --git a/mllib/src/main/scala/org/apache/spark/ml/param/shared/sharedParams.scala b/mllib/src/main/scala/org/apache/spark/ml/param/shared/sharedParams.scala index 30092170863ad..aff47fc326c4a 100644 --- a/mllib/src/main/scala/org/apache/spark/ml/param/shared/sharedParams.scala +++ b/mllib/src/main/scala/org/apache/spark/ml/param/shared/sharedParams.scala @@ -127,10 +127,10 @@ private[ml] trait HasRawPredictionCol extends Params { private[ml] trait HasProbabilityCol extends Params { /** - * Param for Column name for predicted class conditional probabilities. Note: Not all models output well-calibrated probability estimates! These probabilities should be treated as confidences, not precise probabilities.. + * Param for Column name for predicted class conditional probabilities. Note: Not all models output well-calibrated probability estimates! These probabilities should be treated as confidences, not precise probabilities. * @group param */ - final val probabilityCol: Param[String] = new Param[String](this, "probabilityCol", "Column name for predicted class conditional probabilities. Note: Not all models output well-calibrated probability estimates! These probabilities should be treated as confidences, not precise probabilities.") + final val probabilityCol: Param[String] = new Param[String](this, "probabilityCol", "Column name for predicted class conditional probabilities. Note: Not all models output well-calibrated probability estimates! These probabilities should be treated as confidences, not precise probabilities") setDefault(probabilityCol, "probability") @@ -270,10 +270,10 @@ private[ml] trait HasHandleInvalid extends Params { private[ml] trait HasStandardization extends Params { /** - * Param for whether to standardize the training features before fitting the model.. + * Param for whether to standardize the training features before fitting the model. 
* @group param */ - final val standardization: BooleanParam = new BooleanParam(this, "standardization", "whether to standardize the training features before fitting the model.") + final val standardization: BooleanParam = new BooleanParam(this, "standardization", "whether to standardize the training features before fitting the model") setDefault(standardization, true) @@ -304,10 +304,10 @@ private[ml] trait HasSeed extends Params { private[ml] trait HasElasticNetParam extends Params { /** - * Param for the ElasticNet mixing parameter, in range [0, 1]. For alpha = 0, the penalty is an L2 penalty. For alpha = 1, it is an L1 penalty.. + * Param for the ElasticNet mixing parameter, in range [0, 1]. For alpha = 0, the penalty is an L2 penalty. For alpha = 1, it is an L1 penalty. * @group param */ - final val elasticNetParam: DoubleParam = new DoubleParam(this, "elasticNetParam", "the ElasticNet mixing parameter, in range [0, 1]. For alpha = 0, the penalty is an L2 penalty. For alpha = 1, it is an L1 penalty.", ParamValidators.inRange(0, 1)) + final val elasticNetParam: DoubleParam = new DoubleParam(this, "elasticNetParam", "the ElasticNet mixing parameter, in range [0, 1]. For alpha = 0, the penalty is an L2 penalty. For alpha = 1, it is an L1 penalty", ParamValidators.inRange(0, 1)) /** @group getParam */ final def getElasticNetParam: Double = $(elasticNetParam) diff --git a/mllib/src/main/scala/org/apache/spark/mllib/stat/MultivariateOnlineSummarizer.scala b/mllib/src/main/scala/org/apache/spark/mllib/stat/MultivariateOnlineSummarizer.scala index 51b713e263e0c..201333c3690df 100644 --- a/mllib/src/main/scala/org/apache/spark/mllib/stat/MultivariateOnlineSummarizer.scala +++ b/mllib/src/main/scala/org/apache/spark/mllib/stat/MultivariateOnlineSummarizer.scala @@ -23,16 +23,19 @@ import org.apache.spark.mllib.linalg.{Vectors, Vector} /** * :: DeveloperApi :: * MultivariateOnlineSummarizer implements [[MultivariateStatisticalSummary]] to compute the mean, - * variance, minimum, maximum, counts, and nonzero counts for samples in sparse or dense vector + * variance, minimum, maximum, counts, and nonzero counts for instances in sparse or dense vector * format in a online fashion. * * Two MultivariateOnlineSummarizer can be merged together to have a statistical summary of * the corresponding joint dataset. * - * A numerically stable algorithm is implemented to compute sample mean and variance: + * A numerically stable algorithm is implemented to compute the mean and variance of instances: * Reference: [[http://en.wikipedia.org/wiki/Algorithms_for_calculating_variance variance-wiki]] * Zero elements (including explicit zero values) are skipped when calling add(), * to have time complexity O(nnz) instead of O(n) for each column. + * + * For weighted instances, the unbiased estimation of variance is defined by the reliability + * weights: [[https://en.wikipedia.org/wiki/Weighted_arithmetic_mean#Reliability_weights]]. */ @Since("1.1.0") @DeveloperApi @@ -44,6 +47,8 @@ class MultivariateOnlineSummarizer extends MultivariateStatisticalSummary with S private var currM2: Array[Double] = _ private var currL1: Array[Double] = _ private var totalCnt: Long = 0 + private var weightSum: Double = 0.0 + private var weightSquareSum: Double = 0.0 private var nnz: Array[Double] = _ private var currMax: Array[Double] = _ private var currMin: Array[Double] = _ @@ -55,10 +60,15 @@ class MultivariateOnlineSummarizer extends MultivariateStatisticalSummary with S * @return This MultivariateOnlineSummarizer object. 
*/ @Since("1.1.0") - def add(sample: Vector): this.type = { + def add(sample: Vector): this.type = add(sample, 1.0) + + private[spark] def add(instance: Vector, weight: Double): this.type = { + require(weight >= 0.0, s"sample weight, ${weight} has to be >= 0.0") + if (weight == 0.0) return this + if (n == 0) { - require(sample.size > 0, s"Vector should have dimension larger than zero.") - n = sample.size + require(instance.size > 0, s"Vector should have dimension larger than zero.") + n = instance.size currMean = Array.ofDim[Double](n) currM2n = Array.ofDim[Double](n) @@ -69,8 +79,8 @@ class MultivariateOnlineSummarizer extends MultivariateStatisticalSummary with S currMin = Array.fill[Double](n)(Double.MaxValue) } - require(n == sample.size, s"Dimensions mismatch when adding new sample." + - s" Expecting $n but got ${sample.size}.") + require(n == instance.size, s"Dimensions mismatch when adding new sample." + + s" Expecting $n but got ${instance.size}.") val localCurrMean = currMean val localCurrM2n = currM2n @@ -79,7 +89,7 @@ class MultivariateOnlineSummarizer extends MultivariateStatisticalSummary with S val localNnz = nnz val localCurrMax = currMax val localCurrMin = currMin - sample.foreachActive { (index, value) => + instance.foreachActive { (index, value) => if (value != 0.0) { if (localCurrMax(index) < value) { localCurrMax(index) = value @@ -90,15 +100,17 @@ class MultivariateOnlineSummarizer extends MultivariateStatisticalSummary with S val prevMean = localCurrMean(index) val diff = value - prevMean - localCurrMean(index) = prevMean + diff / (localNnz(index) + 1.0) - localCurrM2n(index) += (value - localCurrMean(index)) * diff - localCurrM2(index) += value * value - localCurrL1(index) += math.abs(value) + localCurrMean(index) = prevMean + weight * diff / (localNnz(index) + weight) + localCurrM2n(index) += weight * (value - localCurrMean(index)) * diff + localCurrM2(index) += weight * value * value + localCurrL1(index) += weight * math.abs(value) - localNnz(index) += 1.0 + localNnz(index) += weight } } + weightSum += weight + weightSquareSum += weight * weight totalCnt += 1 this } @@ -112,10 +124,12 @@ class MultivariateOnlineSummarizer extends MultivariateStatisticalSummary with S */ @Since("1.1.0") def merge(other: MultivariateOnlineSummarizer): this.type = { - if (this.totalCnt != 0 && other.totalCnt != 0) { + if (this.weightSum != 0.0 && other.weightSum != 0.0) { require(n == other.n, s"Dimensions mismatch when merging with another summarizer. 
" + s"Expecting $n but got ${other.n}.") totalCnt += other.totalCnt + weightSum += other.weightSum + weightSquareSum += other.weightSquareSum var i = 0 while (i < n) { val thisNnz = nnz(i) @@ -138,13 +152,15 @@ class MultivariateOnlineSummarizer extends MultivariateStatisticalSummary with S nnz(i) = totalNnz i += 1 } - } else if (totalCnt == 0 && other.totalCnt != 0) { + } else if (weightSum == 0.0 && other.weightSum != 0.0) { this.n = other.n this.currMean = other.currMean.clone() this.currM2n = other.currM2n.clone() this.currM2 = other.currM2.clone() this.currL1 = other.currL1.clone() this.totalCnt = other.totalCnt + this.weightSum = other.weightSum + this.weightSquareSum = other.weightSquareSum this.nnz = other.nnz.clone() this.currMax = other.currMax.clone() this.currMin = other.currMin.clone() @@ -158,28 +174,28 @@ class MultivariateOnlineSummarizer extends MultivariateStatisticalSummary with S */ @Since("1.1.0") override def mean: Vector = { - require(totalCnt > 0, s"Nothing has been added to this summarizer.") + require(weightSum > 0, s"Nothing has been added to this summarizer.") val realMean = Array.ofDim[Double](n) var i = 0 while (i < n) { - realMean(i) = currMean(i) * (nnz(i) / totalCnt) + realMean(i) = currMean(i) * (nnz(i) / weightSum) i += 1 } Vectors.dense(realMean) } /** - * Sample variance of each dimension. + * Unbiased estimate of sample variance of each dimension. * */ @Since("1.1.0") override def variance: Vector = { - require(totalCnt > 0, s"Nothing has been added to this summarizer.") + require(weightSum > 0, s"Nothing has been added to this summarizer.") val realVariance = Array.ofDim[Double](n) - val denominator = totalCnt - 1.0 + val denominator = weightSum - (weightSquareSum / weightSum) // Sample variance is computed, if the denominator is less than 0, the variance is just 0. 
if (denominator > 0.0) { @@ -187,9 +203,8 @@ class MultivariateOnlineSummarizer extends MultivariateStatisticalSummary with S var i = 0 val len = currM2n.length while (i < len) { - realVariance(i) = - currM2n(i) + deltaMean(i) * deltaMean(i) * nnz(i) * (totalCnt - nnz(i)) / totalCnt - realVariance(i) /= denominator + realVariance(i) = (currM2n(i) + deltaMean(i) * deltaMean(i) * nnz(i) * + (weightSum - nnz(i)) / weightSum) / denominator i += 1 } } @@ -209,7 +224,7 @@ class MultivariateOnlineSummarizer extends MultivariateStatisticalSummary with S */ @Since("1.1.0") override def numNonzeros: Vector = { - require(totalCnt > 0, s"Nothing has been added to this summarizer.") + require(weightSum > 0, s"Nothing has been added to this summarizer.") Vectors.dense(nnz) } @@ -220,11 +235,11 @@ class MultivariateOnlineSummarizer extends MultivariateStatisticalSummary with S */ @Since("1.1.0") override def max: Vector = { - require(totalCnt > 0, s"Nothing has been added to this summarizer.") + require(weightSum > 0, s"Nothing has been added to this summarizer.") var i = 0 while (i < n) { - if ((nnz(i) < totalCnt) && (currMax(i) < 0.0)) currMax(i) = 0.0 + if ((nnz(i) < weightSum) && (currMax(i) < 0.0)) currMax(i) = 0.0 i += 1 } Vectors.dense(currMax) @@ -236,11 +251,11 @@ class MultivariateOnlineSummarizer extends MultivariateStatisticalSummary with S */ @Since("1.1.0") override def min: Vector = { - require(totalCnt > 0, s"Nothing has been added to this summarizer.") + require(weightSum > 0, s"Nothing has been added to this summarizer.") var i = 0 while (i < n) { - if ((nnz(i) < totalCnt) && (currMin(i) > 0.0)) currMin(i) = 0.0 + if ((nnz(i) < weightSum) && (currMin(i) > 0.0)) currMin(i) = 0.0 i += 1 } Vectors.dense(currMin) @@ -252,7 +267,7 @@ class MultivariateOnlineSummarizer extends MultivariateStatisticalSummary with S */ @Since("1.2.0") override def normL2: Vector = { - require(totalCnt > 0, s"Nothing has been added to this summarizer.") + require(weightSum > 0, s"Nothing has been added to this summarizer.") val realMagnitude = Array.ofDim[Double](n) @@ -271,7 +286,7 @@ class MultivariateOnlineSummarizer extends MultivariateStatisticalSummary with S */ @Since("1.2.0") override def normL1: Vector = { - require(totalCnt > 0, s"Nothing has been added to this summarizer.") + require(weightSum > 0, s"Nothing has been added to this summarizer.") Vectors.dense(currL1) } diff --git a/mllib/src/test/scala/org/apache/spark/ml/classification/LogisticRegressionSuite.scala b/mllib/src/test/scala/org/apache/spark/ml/classification/LogisticRegressionSuite.scala index cce39f382f738..f5219f9f574be 100644 --- a/mllib/src/test/scala/org/apache/spark/ml/classification/LogisticRegressionSuite.scala +++ b/mllib/src/test/scala/org/apache/spark/ml/classification/LogisticRegressionSuite.scala @@ -17,11 +17,14 @@ package org.apache.spark.ml.classification +import scala.util.Random + import org.apache.spark.SparkFunSuite import org.apache.spark.ml.param.ParamsSuite import org.apache.spark.ml.util.MLTestingUtils import org.apache.spark.mllib.classification.LogisticRegressionSuite._ import org.apache.spark.mllib.linalg.{Vectors, Vector} +import org.apache.spark.mllib.regression.LabeledPoint import org.apache.spark.mllib.util.MLlibTestSparkContext import org.apache.spark.mllib.util.TestingUtils._ import org.apache.spark.sql.{DataFrame, Row} @@ -59,8 +62,7 @@ class LogisticRegressionSuite extends SparkFunSuite with MLlibTestSparkContext { val testData = generateMultinomialLogisticInput(weights, xMean, xVariance, true, nPoints, 
42) - sqlContext.createDataFrame( - generateMultinomialLogisticInput(weights, xMean, xVariance, true, nPoints, 42)) + sqlContext.createDataFrame(sc.parallelize(testData, 4)) } } @@ -77,6 +79,7 @@ class LogisticRegressionSuite extends SparkFunSuite with MLlibTestSparkContext { assert(lr.getPredictionCol === "prediction") assert(lr.getRawPredictionCol === "rawPrediction") assert(lr.getProbabilityCol === "probability") + assert(lr.getWeightCol === "") assert(lr.getFitIntercept) assert(lr.getStandardization) val model = lr.fit(dataset) @@ -216,43 +219,65 @@ class LogisticRegressionSuite extends SparkFunSuite with MLlibTestSparkContext { test("MultiClassSummarizer") { val summarizer1 = (new MultiClassSummarizer) .add(0.0).add(3.0).add(4.0).add(3.0).add(6.0) - assert(summarizer1.histogram.zip(Array[Long](1, 0, 0, 2, 1, 0, 1)).forall(x => x._1 === x._2)) + assert(summarizer1.histogram === Array[Double](1, 0, 0, 2, 1, 0, 1)) assert(summarizer1.countInvalid === 0) assert(summarizer1.numClasses === 7) val summarizer2 = (new MultiClassSummarizer) .add(1.0).add(5.0).add(3.0).add(0.0).add(4.0).add(1.0) - assert(summarizer2.histogram.zip(Array[Long](1, 2, 0, 1, 1, 1)).forall(x => x._1 === x._2)) + assert(summarizer2.histogram === Array[Double](1, 2, 0, 1, 1, 1)) assert(summarizer2.countInvalid === 0) assert(summarizer2.numClasses === 6) val summarizer3 = (new MultiClassSummarizer) .add(0.0).add(1.3).add(5.2).add(2.5).add(2.0).add(4.0).add(4.0).add(4.0).add(1.0) - assert(summarizer3.histogram.zip(Array[Long](1, 1, 1, 0, 3)).forall(x => x._1 === x._2)) + assert(summarizer3.histogram === Array[Double](1, 1, 1, 0, 3)) assert(summarizer3.countInvalid === 3) assert(summarizer3.numClasses === 5) val summarizer4 = (new MultiClassSummarizer) .add(3.1).add(4.3).add(2.0).add(1.0).add(3.0) - assert(summarizer4.histogram.zip(Array[Long](0, 1, 1, 1)).forall(x => x._1 === x._2)) + assert(summarizer4.histogram === Array[Double](0, 1, 1, 1)) assert(summarizer4.countInvalid === 2) assert(summarizer4.numClasses === 4) // small map merges large one val summarizerA = summarizer1.merge(summarizer2) assert(summarizerA.hashCode() === summarizer2.hashCode()) - assert(summarizerA.histogram.zip(Array[Long](2, 2, 0, 3, 2, 1, 1)).forall(x => x._1 === x._2)) + assert(summarizerA.histogram === Array[Double](2, 2, 0, 3, 2, 1, 1)) assert(summarizerA.countInvalid === 0) assert(summarizerA.numClasses === 7) // large map merges small one val summarizerB = summarizer3.merge(summarizer4) assert(summarizerB.hashCode() === summarizer3.hashCode()) - assert(summarizerB.histogram.zip(Array[Long](1, 2, 2, 1, 3)).forall(x => x._1 === x._2)) + assert(summarizerB.histogram === Array[Double](1, 2, 2, 1, 3)) assert(summarizerB.countInvalid === 5) assert(summarizerB.numClasses === 5) } + test("MultiClassSummarizer with weighted samples") { + val summarizer1 = (new MultiClassSummarizer) + .add(label = 0.0, weight = 0.2).add(3.0, 0.8).add(4.0, 3.2).add(3.0, 1.3).add(6.0, 3.1) + assert(Vectors.dense(summarizer1.histogram) ~== + Vectors.dense(Array(0.2, 0, 0, 2.1, 3.2, 0, 3.1)) absTol 1E-10) + assert(summarizer1.countInvalid === 0) + assert(summarizer1.numClasses === 7) + + val summarizer2 = (new MultiClassSummarizer) + .add(1.0, 1.1).add(5.0, 2.3).add(3.0).add(0.0).add(4.0).add(1.0).add(2, 0.0) + assert(Vectors.dense(summarizer2.histogram) ~== + Vectors.dense(Array[Double](1.0, 2.1, 0.0, 1, 1, 2.3)) absTol 1E-10) + assert(summarizer2.countInvalid === 0) + assert(summarizer2.numClasses === 6) + + val summarizer = summarizer1.merge(summarizer2) + 
assert(Vectors.dense(summarizer.histogram) ~== + Vectors.dense(Array(1.2, 2.1, 0.0, 3.1, 4.2, 2.3, 3.1)) absTol 1E-10) + assert(summarizer.countInvalid === 0) + assert(summarizer.numClasses === 7) + } + test("binary logistic regression with intercept without regularization") { val trainer1 = (new LogisticRegression).setFitIntercept(true).setStandardization(true) val trainer2 = (new LogisticRegression).setFitIntercept(true).setStandardization(false) @@ -713,7 +738,7 @@ class LogisticRegressionSuite extends SparkFunSuite with MLlibTestSparkContext { b = \log{P(1) / P(0)} = \log{count_1 / count_0} }}} */ - val interceptTheory = math.log(histogram(1).toDouble / histogram(0).toDouble) + val interceptTheory = math.log(histogram(1) / histogram(0)) val weightsTheory = Vectors.dense(0.0, 0.0, 0.0, 0.0) assert(model1.intercept ~== interceptTheory relTol 1E-5) @@ -781,4 +806,63 @@ class LogisticRegressionSuite extends SparkFunSuite with MLlibTestSparkContext { .forall(x => x(0) >= x(1))) } + + test("binary logistic regression with weighted samples") { + val (dataset, weightedDataset) = { + val nPoints = 1000 + val weights = Array(-0.57997, 0.912083, -0.371077, -0.819866, 2.688191) + val xMean = Array(5.843, 3.057, 3.758, 1.199) + val xVariance = Array(0.6856, 0.1899, 3.116, 0.581) + val testData = generateMultinomialLogisticInput(weights, xMean, xVariance, true, nPoints, 42) + + // Let's over-sample the positive samples twice. + val data1 = testData.flatMap { case labeledPoint: LabeledPoint => + if (labeledPoint.label == 1.0) { + Iterator(labeledPoint, labeledPoint) + } else { + Iterator(labeledPoint) + } + } + + val rnd = new Random(8392) + val data2 = testData.flatMap { case LabeledPoint(label: Double, features: Vector) => + if (rnd.nextGaussian() > 0.0) { + if (label == 1.0) { + Iterator( + Instance(label, 1.2, features), + Instance(label, 0.8, features), + Instance(0.0, 0.0, features)) + } else { + Iterator( + Instance(label, 0.3, features), + Instance(1.0, 0.0, features), + Instance(label, 0.1, features), + Instance(label, 0.6, features)) + } + } else { + if (label == 1.0) { + Iterator(Instance(label, 2.0, features)) + } else { + Iterator(Instance(label, 1.0, features)) + } + } + } + + (sqlContext.createDataFrame(sc.parallelize(data1, 4)), + sqlContext.createDataFrame(sc.parallelize(data2, 4))) + } + + val trainer1a = (new LogisticRegression).setFitIntercept(true) + .setRegParam(0.0).setStandardization(true) + val trainer1b = (new LogisticRegression).setFitIntercept(true).setWeightCol("weight") + .setRegParam(0.0).setStandardization(true) + val model1a0 = trainer1a.fit(dataset) + val model1a1 = trainer1a.fit(weightedDataset) + val model1b = trainer1b.fit(weightedDataset) + assert(model1a0.weights !~= model1a1.weights absTol 1E-3) + assert(model1a0.intercept !~= model1a1.intercept absTol 1E-3) + assert(model1a0.weights ~== model1b.weights absTol 1E-3) + assert(model1a0.intercept ~== model1b.intercept absTol 1E-3) + + } } diff --git a/mllib/src/test/scala/org/apache/spark/mllib/stat/MultivariateOnlineSummarizerSuite.scala b/mllib/src/test/scala/org/apache/spark/mllib/stat/MultivariateOnlineSummarizerSuite.scala index 07efde4f5e6dc..b6d41db69be0a 100644 --- a/mllib/src/test/scala/org/apache/spark/mllib/stat/MultivariateOnlineSummarizerSuite.scala +++ b/mllib/src/test/scala/org/apache/spark/mllib/stat/MultivariateOnlineSummarizerSuite.scala @@ -218,4 +218,31 @@ class MultivariateOnlineSummarizerSuite extends SparkFunSuite { s0.merge(s1) assert(s0.mean(0) ~== 1.0 absTol 1e-14) } + + test("merging 
summarizer with weighted samples") { + val summarizer = (new MultivariateOnlineSummarizer) + .add(instance = Vectors.sparse(3, Seq((0, -0.8), (1, 1.7))), weight = 0.1) + .add(Vectors.dense(0.0, -1.2, -1.7), 0.2).merge( + (new MultivariateOnlineSummarizer) + .add(Vectors.sparse(3, Seq((0, -0.7), (1, 0.01), (2, 1.3))), 0.15) + .add(Vectors.dense(-0.5, 0.3, -1.5), 0.05)) + + assert(summarizer.count === 4) + + // The following values are hand calculated using the formula: + // [[https://en.wikipedia.org/wiki/Weighted_arithmetic_mean#Reliability_weights]] + // which defines the reliability weight used for computing the unbiased estimation of variance + // for weighted instances. + assert(summarizer.mean ~== Vectors.dense(Array(-0.42, -0.107, -0.44)) + absTol 1E-10, "mean mismatch") + assert(summarizer.variance ~== Vectors.dense(Array(0.17657142857, 1.645115714, 2.42057142857)) + absTol 1E-8, "variance mismatch") + assert(summarizer.numNonzeros ~== Vectors.dense(Array(0.3, 0.5, 0.4)) + absTol 1E-10, "numNonzeros mismatch") + assert(summarizer.max ~== Vectors.dense(Array(0.0, 1.7, 1.3)) absTol 1E-10, "max mismatch") + assert(summarizer.min ~== Vectors.dense(Array(-0.8, -1.2, -1.7)) absTol 1E-10, "min mismatch") + assert(summarizer.normL2 ~== Vectors.dense(0.387298335, 0.762571308141, 0.9715966241192) + absTol 1E-8, "normL2 mismatch") + assert(summarizer.normL1 ~== Vectors.dense(0.21, 0.4265, 0.61) absTol 1E-10, "normL1 mismatch") + } } diff --git a/project/MimaExcludes.scala b/project/MimaExcludes.scala index 87b141cd3b058..46026c1e90ea0 100644 --- a/project/MimaExcludes.scala +++ b/project/MimaExcludes.scala @@ -45,7 +45,15 @@ object MimaExcludes { excludePackage("org.apache.spark.sql.execution") ) ++ MimaBuild.excludeSparkClass("streaming.flume.FlumeTestUtils") ++ - MimaBuild.excludeSparkClass("streaming.flume.PollingFlumeTestUtils") + MimaBuild.excludeSparkClass("streaming.flume.PollingFlumeTestUtils") ++ + Seq( + ProblemFilters.exclude[MissingMethodProblem]( + "org.apache.spark.ml.classification.LogisticCostFun.this"), + ProblemFilters.exclude[MissingMethodProblem]( + "org.apache.spark.ml.classification.LogisticAggregator.add"), + ProblemFilters.exclude[MissingMethodProblem]( + "org.apache.spark.ml.classification.LogisticAggregator.count") + ) case v if v.startsWith("1.5") => Seq( MimaBuild.excludeSparkPackage("network"), From b6e998634e05db0bb6267173e7b28f885c808c16 Mon Sep 17 00:00:00 2001 From: Andrew Or Date: Tue, 15 Sep 2015 16:45:47 -0700 Subject: [PATCH 301/802] [SPARK-10548] [SPARK-10563] [SQL] Fix concurrent SQL executions *Note: this is for master branch only.* The fix for branch-1.5 is at #8721. The query execution ID is currently passed from a thread to its children, which is not the intended behavior. This led to `IllegalArgumentException: spark.sql.execution.id is already set` when running queries in parallel, e.g.: ``` (1 to 100).par.foreach { _ => sc.parallelize(1 to 5).map { i => (i, i) }.toDF("a", "b").count() } ``` The cause is `SparkContext`'s local properties are inherited by default. This patch adds a way to exclude keys we don't want to be inherited, and makes SQL go through that code path. Author: Andrew Or Closes #8710 from andrewor14/concurrent-sql-executions. 
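A minimal sketch of the failure mode, using only `java.util.Properties` (this is an editor's illustration, not code from the patch; the patch itself clones with commons-lang `SerializationUtils.clone`): wrapping the parent as defaults via `new Properties(parent)` keeps a live reference, so a later mutation in the parent thread is visible to the child, whereas a snapshot taken at inheritance time is not.

```
import java.util.Properties

object InheritedPropertiesSketch {
  def main(args: Array[String]): Unit = {
    val parent = new Properties()
    parent.setProperty("spark.sql.execution.id", "42")

    // Old behavior: the child only wraps the parent as its defaults (live reference).
    val linkedChild = new Properties(parent)
    // New behavior (approximated here with a shallow clone): the child gets a snapshot.
    val snapshotChild = parent.clone().asInstanceOf[Properties]

    // The parent thread later mutates its own properties...
    parent.setProperty("spark.sql.execution.id", "changed-by-parent")

    // ...and only the linked child observes the change.
    println(linkedChild.getProperty("spark.sql.execution.id"))   // changed-by-parent
    println(snapshotChild.getProperty("spark.sql.execution.id")) // 42
  }
}
```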
--- .../scala/org/apache/spark/SparkContext.scala | 9 +- .../org/apache/spark/ThreadingSuite.scala | 65 +++++------ .../sql/execution/SQLExecutionSuite.scala | 101 ++++++++++++++++++ 3 files changed, 132 insertions(+), 43 deletions(-) create mode 100644 sql/core/src/test/scala/org/apache/spark/sql/execution/SQLExecutionSuite.scala diff --git a/core/src/main/scala/org/apache/spark/SparkContext.scala b/core/src/main/scala/org/apache/spark/SparkContext.scala index dee6091ce3caf..a2f34eafa2c38 100644 --- a/core/src/main/scala/org/apache/spark/SparkContext.scala +++ b/core/src/main/scala/org/apache/spark/SparkContext.scala @@ -33,6 +33,7 @@ import scala.collection.mutable.HashMap import scala.reflect.{ClassTag, classTag} import scala.util.control.NonFatal +import org.apache.commons.lang.SerializationUtils import org.apache.hadoop.conf.Configuration import org.apache.hadoop.fs.Path import org.apache.hadoop.io.{ArrayWritable, BooleanWritable, BytesWritable, DoubleWritable, @@ -347,8 +348,12 @@ class SparkContext(config: SparkConf) extends Logging with ExecutorAllocationCli private[spark] var checkpointDir: Option[String] = None // Thread Local variable that can be used by users to pass information down the stack - private val localProperties = new InheritableThreadLocal[Properties] { - override protected def childValue(parent: Properties): Properties = new Properties(parent) + protected[spark] val localProperties = new InheritableThreadLocal[Properties] { + override protected def childValue(parent: Properties): Properties = { + // Note: make a clone such that changes in the parent properties aren't reflected in + // the those of the children threads, which has confusing semantics (SPARK-10563). + SerializationUtils.clone(parent).asInstanceOf[Properties] + } override protected def initialValue(): Properties = new Properties() } diff --git a/core/src/test/scala/org/apache/spark/ThreadingSuite.scala b/core/src/test/scala/org/apache/spark/ThreadingSuite.scala index a96a4ce201c21..54c131cdae367 100644 --- a/core/src/test/scala/org/apache/spark/ThreadingSuite.scala +++ b/core/src/test/scala/org/apache/spark/ThreadingSuite.scala @@ -147,7 +147,7 @@ class ThreadingSuite extends SparkFunSuite with LocalSparkContext with Logging { }.start() } sem.acquire(2) - throwable.foreach { t => throw t } + throwable.foreach { t => throw improveStackTrace(t) } if (ThreadingSuiteState.failed.get()) { logError("Waited 1 second without seeing runningThreads = 4 (it was " + ThreadingSuiteState.runningThreads.get() + "); failing test") @@ -178,7 +178,7 @@ class ThreadingSuite extends SparkFunSuite with LocalSparkContext with Logging { threads.foreach(_.start()) sem.acquire(5) - throwable.foreach { t => throw t } + throwable.foreach { t => throw improveStackTrace(t) } assert(sc.getLocalProperty("test") === null) } @@ -207,58 +207,41 @@ class ThreadingSuite extends SparkFunSuite with LocalSparkContext with Logging { threads.foreach(_.start()) sem.acquire(5) - throwable.foreach { t => throw t } + throwable.foreach { t => throw improveStackTrace(t) } assert(sc.getLocalProperty("test") === "parent") assert(sc.getLocalProperty("Foo") === null) } - test("mutations to local properties should not affect submitted jobs (SPARK-6629)") { - val jobStarted = new Semaphore(0) - val jobEnded = new Semaphore(0) - @volatile var jobResult: JobResult = null - var throwable: Option[Throwable] = None - + test("mutation in parent local property does not affect child (SPARK-10563)") { sc = new SparkContext("local", "test") - 
sc.setJobGroup("originalJobGroupId", "description") - sc.addSparkListener(new SparkListener { - override def onJobStart(jobStart: SparkListenerJobStart): Unit = { - jobStarted.release() - } - override def onJobEnd(jobEnd: SparkListenerJobEnd): Unit = { - jobResult = jobEnd.jobResult - jobEnded.release() - } - }) - - // Create a new thread which will inherit the current thread's properties - val thread = new Thread() { + val originalTestValue: String = "original-value" + var threadTestValue: String = null + sc.setLocalProperty("test", originalTestValue) + var throwable: Option[Throwable] = None + val thread = new Thread { override def run(): Unit = { try { - assert(sc.getLocalProperty(SparkContext.SPARK_JOB_GROUP_ID) === "originalJobGroupId") - // Sleeps for a total of 10 seconds, but allows cancellation to interrupt the task - try { - sc.parallelize(1 to 100).foreach { x => - Thread.sleep(100) - } - } catch { - case s: SparkException => // ignored so that we don't print noise in test logs - } + threadTestValue = sc.getLocalProperty("test") } catch { case t: Throwable => throwable = Some(t) } } } + sc.setLocalProperty("test", "this-should-not-be-inherited") thread.start() - // Wait for the job to start, then mutate the original properties, which should have been - // inherited by the running job but hopefully defensively copied or snapshotted: - jobStarted.tryAcquire(10, TimeUnit.SECONDS) - sc.setJobGroup("modifiedJobGroupId", "description") - // Canceling the original job group should cancel the running job. In other words, the - // modification of the properties object should not affect the properties of running jobs - sc.cancelJobGroup("originalJobGroupId") - jobEnded.tryAcquire(10, TimeUnit.SECONDS) - throwable.foreach { t => throw t } - assert(jobResult.isInstanceOf[JobFailed]) + thread.join() + throwable.foreach { t => throw improveStackTrace(t) } + assert(threadTestValue === originalTestValue) } + + /** + * Improve the stack trace of an error thrown from within a thread. + * Otherwise it's difficult to tell which line in the test the error came from. + */ + private def improveStackTrace(t: Throwable): Throwable = { + t.setStackTrace(t.getStackTrace ++ Thread.currentThread.getStackTrace) + t + } + } diff --git a/sql/core/src/test/scala/org/apache/spark/sql/execution/SQLExecutionSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/execution/SQLExecutionSuite.scala new file mode 100644 index 0000000000000..63639681ef80a --- /dev/null +++ b/sql/core/src/test/scala/org/apache/spark/sql/execution/SQLExecutionSuite.scala @@ -0,0 +1,101 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.spark.sql.execution + +import java.util.Properties + +import scala.collection.parallel.CompositeThrowable + +import org.apache.spark.{SparkConf, SparkContext, SparkFunSuite} +import org.apache.spark.sql.SQLContext + +class SQLExecutionSuite extends SparkFunSuite { + + test("concurrent query execution (SPARK-10548)") { + // Try to reproduce the issue with the old SparkContext + val conf = new SparkConf() + .setMaster("local[*]") + .setAppName("test") + val badSparkContext = new BadSparkContext(conf) + try { + testConcurrentQueryExecution(badSparkContext) + fail("unable to reproduce SPARK-10548") + } catch { + case e: IllegalArgumentException => + assert(e.getMessage.contains(SQLExecution.EXECUTION_ID_KEY)) + } finally { + badSparkContext.stop() + } + + // Verify that the issue is fixed with the latest SparkContext + val goodSparkContext = new SparkContext(conf) + try { + testConcurrentQueryExecution(goodSparkContext) + } finally { + goodSparkContext.stop() + } + } + + /** + * Trigger SPARK-10548 by mocking a parent and its child thread executing queries concurrently. + */ + private def testConcurrentQueryExecution(sc: SparkContext): Unit = { + val sqlContext = new SQLContext(sc) + import sqlContext.implicits._ + + // Initialize local properties. This is necessary for the test to pass. + sc.getLocalProperties + + // Set up a thread that runs executes a simple SQL query. + // Before starting the thread, mutate the execution ID in the parent. + // The child thread should not see the effect of this change. + var throwable: Option[Throwable] = None + val child = new Thread { + override def run(): Unit = { + try { + sc.parallelize(1 to 100).map { i => (i, i) }.toDF("a", "b").collect() + } catch { + case t: Throwable => + throwable = Some(t) + } + + } + } + sc.setLocalProperty(SQLExecution.EXECUTION_ID_KEY, "anything") + child.start() + child.join() + + // The throwable is thrown from the child thread so it doesn't have a helpful stack trace + throwable.foreach { t => + t.setStackTrace(t.getStackTrace ++ Thread.currentThread.getStackTrace) + throw t + } + } + +} + +/** + * A bad [[SparkContext]] that does not clone the inheritable thread local properties + * when passing them to children threads. + */ +private class BadSparkContext(conf: SparkConf) extends SparkContext(conf) { + protected[spark] override val localProperties = new InheritableThreadLocal[Properties] { + override protected def childValue(parent: Properties): Properties = new Properties(parent) + override protected def initialValue(): Properties = new Properties() + } +} From a63cdc769f511e98b38c3318bcc732c9a6c76c22 Mon Sep 17 00:00:00 2001 From: Reynold Xin Date: Tue, 15 Sep 2015 16:53:27 -0700 Subject: [PATCH 302/802] [SPARK-10612] [SQL] Add prepare to LocalNode. The idea is that we should separate the function call that does memory reservation (i.e. prepare) from the function call that consumes the input (e.g. open()), so all operators can be a chance to reserve memory before they are all consumed. Author: Reynold Xin Closes #8761 from rxin/SPARK-10612. 
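A minimal sketch of the intended call order, using made-up names (`SketchNode`, `SketchFilter`) rather than the real `LocalNode` API: `prepare()` recurses over the whole operator tree and reserves resources first, and only afterwards does `open()` start consuming input.

```
trait SketchNode {
  def children: Seq[SketchNode]
  // Reserve resources only; must not consume any input, and must recurse into children.
  def prepare(): Unit = children.foreach(_.prepare())
  def open(): Unit
}

class SketchLeaf extends SketchNode {
  val children: Seq[SketchNode] = Seq.empty
  def open(): Unit = println("leaf: start producing rows")
}

class SketchFilter(child: SketchNode) extends SketchNode {
  val children: Seq[SketchNode] = Seq(child)
  override def prepare(): Unit = {
    println("filter: reserve memory") // reservation happens before any open()
    super.prepare()
  }
  def open(): Unit = {
    child.open()
    println("filter: start consuming rows")
  }
}

object PrepareThenOpen {
  def main(args: Array[String]): Unit = {
    val plan = new SketchFilter(new SketchLeaf)
    plan.prepare() // the whole tree reserves memory first
    plan.open()    // only then is any input consumed
  }
}
```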
--- .../org/apache/spark/sql/execution/local/LocalNode.scala | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/local/LocalNode.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/local/LocalNode.scala index 9840080e16953..569cff565c092 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/local/LocalNode.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/local/LocalNode.scala @@ -45,6 +45,14 @@ abstract class LocalNode(conf: SQLConf) extends TreeNode[LocalNode] with Logging def output: Seq[Attribute] + /** + * Called before open(). Prepare can be used to reserve memory needed. It must NOT consume + * any input data. + * + * Implementations of this must also call the `prepare()` function of its children. + */ + def prepare(): Unit = children.foreach(_.prepare()) + /** * Initializes the iterator state. Must be called before calling `next()`. * From 99ecfa5945aedaa71765ecf5cce59964ae52eebe Mon Sep 17 00:00:00 2001 From: vinodkc Date: Tue, 15 Sep 2015 17:01:10 -0700 Subject: [PATCH 303/802] [SPARK-10575] [SPARK CORE] Wrapped RDD.takeSample with Scope Remove return statements in RDD.takeSample and wrap it withScope Author: vinodkc Author: vinodkc Author: Vinod K C Closes #8730 from vinodkc/fix_takesample_return. --- .../main/scala/org/apache/spark/rdd/RDD.scala | 68 +++++++++---------- 1 file changed, 31 insertions(+), 37 deletions(-) diff --git a/core/src/main/scala/org/apache/spark/rdd/RDD.scala b/core/src/main/scala/org/apache/spark/rdd/RDD.scala index 7dd2bc5d7cd72..a56e542242d5f 100644 --- a/core/src/main/scala/org/apache/spark/rdd/RDD.scala +++ b/core/src/main/scala/org/apache/spark/rdd/RDD.scala @@ -469,50 +469,44 @@ abstract class RDD[T: ClassTag]( * @param seed seed for the random number generator * @return sample of specified size in an array */ - // TODO: rewrite this without return statements so we can wrap it in a scope def takeSample( withReplacement: Boolean, num: Int, - seed: Long = Utils.random.nextLong): Array[T] = { + seed: Long = Utils.random.nextLong): Array[T] = withScope { val numStDev = 10.0 - if (num < 0) { - throw new IllegalArgumentException("Negative number of elements requested") - } else if (num == 0) { - return new Array[T](0) - } - - val initialCount = this.count() - if (initialCount == 0) { - return new Array[T](0) - } - - val maxSampleSize = Int.MaxValue - (numStDev * math.sqrt(Int.MaxValue)).toInt - if (num > maxSampleSize) { - throw new IllegalArgumentException("Cannot support a sample size > Int.MaxValue - " + - s"$numStDev * math.sqrt(Int.MaxValue)") - } - - val rand = new Random(seed) - if (!withReplacement && num >= initialCount) { - return Utils.randomizeInPlace(this.collect(), rand) - } - - val fraction = SamplingUtils.computeFractionForSampleSize(num, initialCount, - withReplacement) - - var samples = this.sample(withReplacement, fraction, rand.nextInt()).collect() + require(num >= 0, "Negative number of elements requested") + require(num <= (Int.MaxValue - (numStDev * math.sqrt(Int.MaxValue)).toInt), + "Cannot support a sample size > Int.MaxValue - " + + s"$numStDev * math.sqrt(Int.MaxValue)") - // If the first sample didn't turn out large enough, keep trying to take samples; - // this shouldn't happen often because we use a big multiplier for the initial size - var numIters = 0 - while (samples.length < num) { - logWarning(s"Needed to re-sample due to insufficient sample size. 
Repeat #$numIters") - samples = this.sample(withReplacement, fraction, rand.nextInt()).collect() - numIters += 1 + if (num == 0) { + new Array[T](0) + } else { + val initialCount = this.count() + if (initialCount == 0) { + new Array[T](0) + } else { + val rand = new Random(seed) + if (!withReplacement && num >= initialCount) { + Utils.randomizeInPlace(this.collect(), rand) + } else { + val fraction = SamplingUtils.computeFractionForSampleSize(num, initialCount, + withReplacement) + var samples = this.sample(withReplacement, fraction, rand.nextInt()).collect() + + // If the first sample didn't turn out large enough, keep trying to take samples; + // this shouldn't happen often because we use a big multiplier for the initial size + var numIters = 0 + while (samples.length < num) { + logWarning(s"Needed to re-sample due to insufficient sample size. Repeat #$numIters") + samples = this.sample(withReplacement, fraction, rand.nextInt()).collect() + numIters += 1 + } + Utils.randomizeInPlace(samples, rand).take(num) + } + } } - - Utils.randomizeInPlace(samples, rand).take(num) } /** From 38700ea40cb1dd0805cc926a9e629f93c99527ad Mon Sep 17 00:00:00 2001 From: Josh Rosen Date: Tue, 15 Sep 2015 17:11:21 -0700 Subject: [PATCH 304/802] [SPARK-10381] Fix mixup of taskAttemptNumber & attemptId in OutputCommitCoordinator When speculative execution is enabled, consider a scenario where the authorized committer of a particular output partition fails during the OutputCommitter.commitTask() call. In this case, the OutputCommitCoordinator is supposed to release that committer's exclusive lock on committing once that task fails. However, due to a unit mismatch (we used task attempt number in one place and task attempt id in another) the lock will not be released, causing Spark to go into an infinite retry loop. This bug was masked by the fact that the OutputCommitCoordinator does not have enough end-to-end tests (the current tests use many mocks). Other factors contributing to this bug are the fact that we have many similarly-named identifiers that have different semantics but the same data types (e.g. attemptNumber and taskAttemptId, with inconsistent variable naming which makes them difficult to distinguish). This patch adds a regression test and fixes this bug by always using task attempt numbers throughout this code. Author: Josh Rosen Closes #8544 from JoshRosen/SPARK-10381. 
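A toy sketch of the unit mismatch (made-up types, not the real `OutputCommitCoordinator`): the coordinator records one kind of identifier, but the failure path compares against the other, so the check never matches, the commit lock is never released, and every later attempt is denied.

```
object CommitLockSketch {
  // A globally unique attempt ID (e.g. 4517L) vs. the per-task retry counter
  // (0, 1, 2, ...). Both are numeric, which is why the mixup type-checked.
  final case class Attempt(taskAttemptId: Long, attemptNumber: Int)

  private var authorizedCommitter: Option[Long] = None // stores a taskAttemptId

  def canCommit(a: Attempt): Boolean = authorizedCommitter match {
    case None    => authorizedCommitter = Some(a.taskAttemptId); true
    case Some(_) => false
  }

  // Buggy failure handler: compares the stored attempt ID against the attempt *number*.
  def taskFailedBuggy(a: Attempt): Unit = {
    if (authorizedCommitter.contains(a.attemptNumber.toLong)) {
      authorizedCommitter = None
    }
  }

  def main(args: Array[String]): Unit = {
    val first = Attempt(taskAttemptId = 4517L, attemptNumber = 0)
    println(canCommit(first)) // true: first becomes the authorized committer
    taskFailedBuggy(first)    // 4517L != 0, so the lock is NOT cleared
    val retry = Attempt(taskAttemptId = 4518L, attemptNumber = 1)
    println(canCommit(retry)) // false: the retry is denied forever
  }
}
```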
--- .../org/apache/spark/SparkHadoopWriter.scala | 3 +- .../org/apache/spark/TaskEndReason.scala | 7 +- .../executor/CommitDeniedException.scala | 4 +- .../spark/mapred/SparkHadoopMapRedUtil.scala | 20 ++---- .../apache/spark/scheduler/DAGScheduler.scala | 7 +- .../scheduler/OutputCommitCoordinator.scala | 48 +++++++------ .../org/apache/spark/scheduler/TaskInfo.scala | 7 +- .../status/api/v1/AllStagesResource.scala | 2 +- .../org/apache/spark/ui/jobs/StagePage.scala | 4 +- .../org/apache/spark/util/JsonProtocol.scala | 2 +- ...putCommitCoordinatorIntegrationSuite.scala | 68 +++++++++++++++++++ .../OutputCommitCoordinatorSuite.scala | 24 ++++--- .../apache/spark/util/JsonProtocolSuite.scala | 2 +- project/MimaExcludes.scala | 36 +++++++++- .../datasources/WriterContainer.scala | 3 +- .../sql/execution/ui/SQLListenerSuite.scala | 4 +- .../spark/sql/hive/hiveWriterContainers.scala | 2 +- 17 files changed, 174 insertions(+), 69 deletions(-) create mode 100644 core/src/test/scala/org/apache/spark/scheduler/OutputCommitCoordinatorIntegrationSuite.scala diff --git a/core/src/main/scala/org/apache/spark/SparkHadoopWriter.scala b/core/src/main/scala/org/apache/spark/SparkHadoopWriter.scala index ae5926dd534a6..ac6eaab20d8d2 100644 --- a/core/src/main/scala/org/apache/spark/SparkHadoopWriter.scala +++ b/core/src/main/scala/org/apache/spark/SparkHadoopWriter.scala @@ -104,8 +104,7 @@ class SparkHadoopWriter(jobConf: JobConf) } def commit() { - SparkHadoopMapRedUtil.commitTask( - getOutputCommitter(), getTaskContext(), jobID, splitID, attemptID) + SparkHadoopMapRedUtil.commitTask(getOutputCommitter(), getTaskContext(), jobID, splitID) } def commitJob() { diff --git a/core/src/main/scala/org/apache/spark/TaskEndReason.scala b/core/src/main/scala/org/apache/spark/TaskEndReason.scala index 2ae878b3e6087..7137246bc34f2 100644 --- a/core/src/main/scala/org/apache/spark/TaskEndReason.scala +++ b/core/src/main/scala/org/apache/spark/TaskEndReason.scala @@ -193,9 +193,12 @@ case object TaskKilled extends TaskFailedReason { * Task requested the driver to commit, but was denied. */ @DeveloperApi -case class TaskCommitDenied(jobID: Int, partitionID: Int, attemptID: Int) extends TaskFailedReason { +case class TaskCommitDenied( + jobID: Int, + partitionID: Int, + attemptNumber: Int) extends TaskFailedReason { override def toErrorString: String = s"TaskCommitDenied (Driver denied task commit)" + - s" for job: $jobID, partition: $partitionID, attempt: $attemptID" + s" for job: $jobID, partition: $partitionID, attemptNumber: $attemptNumber" /** * If a task failed because its attempt to commit was denied, do not count this failure * towards failing the stage. 
This is intended to prevent spurious stage failures in cases diff --git a/core/src/main/scala/org/apache/spark/executor/CommitDeniedException.scala b/core/src/main/scala/org/apache/spark/executor/CommitDeniedException.scala index f47d7ef511da1..7d84889a2def0 100644 --- a/core/src/main/scala/org/apache/spark/executor/CommitDeniedException.scala +++ b/core/src/main/scala/org/apache/spark/executor/CommitDeniedException.scala @@ -26,8 +26,8 @@ private[spark] class CommitDeniedException( msg: String, jobID: Int, splitID: Int, - attemptID: Int) + attemptNumber: Int) extends Exception(msg) { - def toTaskEndReason: TaskEndReason = TaskCommitDenied(jobID, splitID, attemptID) + def toTaskEndReason: TaskEndReason = TaskCommitDenied(jobID, splitID, attemptNumber) } diff --git a/core/src/main/scala/org/apache/spark/mapred/SparkHadoopMapRedUtil.scala b/core/src/main/scala/org/apache/spark/mapred/SparkHadoopMapRedUtil.scala index f405b732e4725..f7298e8d5c62c 100644 --- a/core/src/main/scala/org/apache/spark/mapred/SparkHadoopMapRedUtil.scala +++ b/core/src/main/scala/org/apache/spark/mapred/SparkHadoopMapRedUtil.scala @@ -91,8 +91,7 @@ object SparkHadoopMapRedUtil extends Logging { committer: MapReduceOutputCommitter, mrTaskContext: MapReduceTaskAttemptContext, jobId: Int, - splitId: Int, - attemptId: Int): Unit = { + splitId: Int): Unit = { val mrTaskAttemptID = SparkHadoopUtil.get.getTaskAttemptIDFromTaskAttemptContext(mrTaskContext) @@ -122,7 +121,8 @@ object SparkHadoopMapRedUtil extends Logging { if (shouldCoordinateWithDriver) { val outputCommitCoordinator = SparkEnv.get.outputCommitCoordinator - val canCommit = outputCommitCoordinator.canCommit(jobId, splitId, attemptId) + val taskAttemptNumber = TaskContext.get().attemptNumber() + val canCommit = outputCommitCoordinator.canCommit(jobId, splitId, taskAttemptNumber) if (canCommit) { performCommit() @@ -132,7 +132,7 @@ object SparkHadoopMapRedUtil extends Logging { logInfo(message) // We need to abort the task so that the driver can reschedule new attempts, if necessary committer.abortTask(mrTaskContext) - throw new CommitDeniedException(message, jobId, splitId, attemptId) + throw new CommitDeniedException(message, jobId, splitId, taskAttemptNumber) } } else { // Speculation is disabled or a user has chosen to manually bypass the commit coordination @@ -143,16 +143,4 @@ object SparkHadoopMapRedUtil extends Logging { logInfo(s"No need to commit output of task because needsTaskCommit=false: $mrTaskAttemptID") } } - - def commitTask( - committer: MapReduceOutputCommitter, - mrTaskContext: MapReduceTaskAttemptContext, - sparkTaskContext: TaskContext): Unit = { - commitTask( - committer, - mrTaskContext, - sparkTaskContext.stageId(), - sparkTaskContext.partitionId(), - sparkTaskContext.attemptNumber()) - } } diff --git a/core/src/main/scala/org/apache/spark/scheduler/DAGScheduler.scala b/core/src/main/scala/org/apache/spark/scheduler/DAGScheduler.scala index b4f90e8347894..3c9a66e504403 100644 --- a/core/src/main/scala/org/apache/spark/scheduler/DAGScheduler.scala +++ b/core/src/main/scala/org/apache/spark/scheduler/DAGScheduler.scala @@ -1128,8 +1128,11 @@ class DAGScheduler( val stageId = task.stageId val taskType = Utils.getFormattedClassName(task) - outputCommitCoordinator.taskCompleted(stageId, task.partitionId, - event.taskInfo.attempt, event.reason) + outputCommitCoordinator.taskCompleted( + stageId, + task.partitionId, + event.taskInfo.attemptNumber, // this is a task attempt number + event.reason) // The success case is dealt with separately 
below, since we need to compute accumulator // updates before posting. diff --git a/core/src/main/scala/org/apache/spark/scheduler/OutputCommitCoordinator.scala b/core/src/main/scala/org/apache/spark/scheduler/OutputCommitCoordinator.scala index 5d926377ce86b..add0dedc03f44 100644 --- a/core/src/main/scala/org/apache/spark/scheduler/OutputCommitCoordinator.scala +++ b/core/src/main/scala/org/apache/spark/scheduler/OutputCommitCoordinator.scala @@ -25,7 +25,7 @@ import org.apache.spark.rpc.{RpcCallContext, RpcEndpointRef, RpcEnv, RpcEndpoint private sealed trait OutputCommitCoordinationMessage extends Serializable private case object StopCoordinator extends OutputCommitCoordinationMessage -private case class AskPermissionToCommitOutput(stage: Int, task: Long, taskAttempt: Long) +private case class AskPermissionToCommitOutput(stage: Int, partition: Int, attemptNumber: Int) /** * Authority that decides whether tasks can commit output to HDFS. Uses a "first committer wins" @@ -44,8 +44,8 @@ private[spark] class OutputCommitCoordinator(conf: SparkConf, isDriver: Boolean) var coordinatorRef: Option[RpcEndpointRef] = None private type StageId = Int - private type PartitionId = Long - private type TaskAttemptId = Long + private type PartitionId = Int + private type TaskAttemptNumber = Int /** * Map from active stages's id => partition id => task attempt with exclusive lock on committing @@ -57,7 +57,8 @@ private[spark] class OutputCommitCoordinator(conf: SparkConf, isDriver: Boolean) * Access to this map should be guarded by synchronizing on the OutputCommitCoordinator instance. */ private val authorizedCommittersByStage: CommittersByStageMap = mutable.Map() - private type CommittersByStageMap = mutable.Map[StageId, mutable.Map[PartitionId, TaskAttemptId]] + private type CommittersByStageMap = + mutable.Map[StageId, mutable.Map[PartitionId, TaskAttemptNumber]] /** * Returns whether the OutputCommitCoordinator's internal data structures are all empty. 
@@ -75,14 +76,15 @@ private[spark] class OutputCommitCoordinator(conf: SparkConf, isDriver: Boolean) * * @param stage the stage number * @param partition the partition number - * @param attempt a unique identifier for this task attempt + * @param attemptNumber how many times this task has been attempted + * (see [[TaskContext.attemptNumber()]]) * @return true if this task is authorized to commit, false otherwise */ def canCommit( stage: StageId, partition: PartitionId, - attempt: TaskAttemptId): Boolean = { - val msg = AskPermissionToCommitOutput(stage, partition, attempt) + attemptNumber: TaskAttemptNumber): Boolean = { + val msg = AskPermissionToCommitOutput(stage, partition, attemptNumber) coordinatorRef match { case Some(endpointRef) => endpointRef.askWithRetry[Boolean](msg) @@ -95,7 +97,7 @@ private[spark] class OutputCommitCoordinator(conf: SparkConf, isDriver: Boolean) // Called by DAGScheduler private[scheduler] def stageStart(stage: StageId): Unit = synchronized { - authorizedCommittersByStage(stage) = mutable.HashMap[PartitionId, TaskAttemptId]() + authorizedCommittersByStage(stage) = mutable.HashMap[PartitionId, TaskAttemptNumber]() } // Called by DAGScheduler @@ -107,7 +109,7 @@ private[spark] class OutputCommitCoordinator(conf: SparkConf, isDriver: Boolean) private[scheduler] def taskCompleted( stage: StageId, partition: PartitionId, - attempt: TaskAttemptId, + attemptNumber: TaskAttemptNumber, reason: TaskEndReason): Unit = synchronized { val authorizedCommitters = authorizedCommittersByStage.getOrElse(stage, { logDebug(s"Ignoring task completion for completed stage") @@ -117,12 +119,12 @@ private[spark] class OutputCommitCoordinator(conf: SparkConf, isDriver: Boolean) case Success => // The task output has been committed successfully case denied: TaskCommitDenied => - logInfo( - s"Task was denied committing, stage: $stage, partition: $partition, attempt: $attempt") + logInfo(s"Task was denied committing, stage: $stage, partition: $partition, " + + s"attempt: $attemptNumber") case otherReason => - if (authorizedCommitters.get(partition).exists(_ == attempt)) { - logDebug(s"Authorized committer $attempt (stage=$stage, partition=$partition) failed;" + - s" clearing lock") + if (authorizedCommitters.get(partition).exists(_ == attemptNumber)) { + logDebug(s"Authorized committer (attemptNumber=$attemptNumber, stage=$stage, " + + s"partition=$partition) failed; clearing lock") authorizedCommitters.remove(partition) } } @@ -140,21 +142,23 @@ private[spark] class OutputCommitCoordinator(conf: SparkConf, isDriver: Boolean) private[scheduler] def handleAskPermissionToCommit( stage: StageId, partition: PartitionId, - attempt: TaskAttemptId): Boolean = synchronized { + attemptNumber: TaskAttemptNumber): Boolean = synchronized { authorizedCommittersByStage.get(stage) match { case Some(authorizedCommitters) => authorizedCommitters.get(partition) match { case Some(existingCommitter) => - logDebug(s"Denying $attempt to commit for stage=$stage, partition=$partition; " + - s"existingCommitter = $existingCommitter") + logDebug(s"Denying attemptNumber=$attemptNumber to commit for stage=$stage, " + + s"partition=$partition; existingCommitter = $existingCommitter") false case None => - logDebug(s"Authorizing $attempt to commit for stage=$stage, partition=$partition") - authorizedCommitters(partition) = attempt + logDebug(s"Authorizing attemptNumber=$attemptNumber to commit for stage=$stage, " + + s"partition=$partition") + authorizedCommitters(partition) = attemptNumber true } case None => - 
logDebug(s"Stage $stage has completed, so not allowing task attempt $attempt to commit") + logDebug(s"Stage $stage has completed, so not allowing attempt number $attemptNumber of" + + s"partition $partition to commit") false } } @@ -174,9 +178,9 @@ private[spark] object OutputCommitCoordinator { } override def receiveAndReply(context: RpcCallContext): PartialFunction[Any, Unit] = { - case AskPermissionToCommitOutput(stage, partition, taskAttempt) => + case AskPermissionToCommitOutput(stage, partition, attemptNumber) => context.reply( - outputCommitCoordinator.handleAskPermissionToCommit(stage, partition, taskAttempt)) + outputCommitCoordinator.handleAskPermissionToCommit(stage, partition, attemptNumber)) } } } diff --git a/core/src/main/scala/org/apache/spark/scheduler/TaskInfo.scala b/core/src/main/scala/org/apache/spark/scheduler/TaskInfo.scala index 132a9ced77700..f113c2b1b8433 100644 --- a/core/src/main/scala/org/apache/spark/scheduler/TaskInfo.scala +++ b/core/src/main/scala/org/apache/spark/scheduler/TaskInfo.scala @@ -29,7 +29,7 @@ import org.apache.spark.annotation.DeveloperApi class TaskInfo( val taskId: Long, val index: Int, - val attempt: Int, + val attemptNumber: Int, val launchTime: Long, val executorId: String, val host: String, @@ -95,7 +95,10 @@ class TaskInfo( } } - def id: String = s"$index.$attempt" + @deprecated("Use attemptNumber", "1.6.0") + def attempt: Int = attemptNumber + + def id: String = s"$index.$attemptNumber" def duration: Long = { if (!finished) { diff --git a/core/src/main/scala/org/apache/spark/status/api/v1/AllStagesResource.scala b/core/src/main/scala/org/apache/spark/status/api/v1/AllStagesResource.scala index 390c136df79b3..24a0b5220695c 100644 --- a/core/src/main/scala/org/apache/spark/status/api/v1/AllStagesResource.scala +++ b/core/src/main/scala/org/apache/spark/status/api/v1/AllStagesResource.scala @@ -127,7 +127,7 @@ private[v1] object AllStagesResource { new TaskData( taskId = uiData.taskInfo.taskId, index = uiData.taskInfo.index, - attempt = uiData.taskInfo.attempt, + attempt = uiData.taskInfo.attemptNumber, launchTime = new Date(uiData.taskInfo.launchTime), executorId = uiData.taskInfo.executorId, host = uiData.taskInfo.host, diff --git a/core/src/main/scala/org/apache/spark/ui/jobs/StagePage.scala b/core/src/main/scala/org/apache/spark/ui/jobs/StagePage.scala index 2b71f55b7bb4f..712782d27b3cf 100644 --- a/core/src/main/scala/org/apache/spark/ui/jobs/StagePage.scala +++ b/core/src/main/scala/org/apache/spark/ui/jobs/StagePage.scala @@ -621,7 +621,7 @@ private[ui] class StagePage(parent: StagesTab) extends WebUIPage("stage") { serializationTimeProportionPos + serializationTimeProportion val index = taskInfo.index - val attempt = taskInfo.attempt + val attempt = taskInfo.attemptNumber val svgTag = if (totalExecutionTime == 0) { @@ -967,7 +967,7 @@ private[ui] class TaskDataSource( new TaskTableRowData( info.index, info.taskId, - info.attempt, + info.attemptNumber, info.speculative, info.status, info.taskLocality.toString, diff --git a/core/src/main/scala/org/apache/spark/util/JsonProtocol.scala b/core/src/main/scala/org/apache/spark/util/JsonProtocol.scala index 24f78744ad74c..99614a786bd93 100644 --- a/core/src/main/scala/org/apache/spark/util/JsonProtocol.scala +++ b/core/src/main/scala/org/apache/spark/util/JsonProtocol.scala @@ -266,7 +266,7 @@ private[spark] object JsonProtocol { def taskInfoToJson(taskInfo: TaskInfo): JValue = { ("Task ID" -> taskInfo.taskId) ~ ("Index" -> taskInfo.index) ~ - ("Attempt" -> taskInfo.attempt) ~ + 
("Attempt" -> taskInfo.attemptNumber) ~ ("Launch Time" -> taskInfo.launchTime) ~ ("Executor ID" -> taskInfo.executorId) ~ ("Host" -> taskInfo.host) ~ diff --git a/core/src/test/scala/org/apache/spark/scheduler/OutputCommitCoordinatorIntegrationSuite.scala b/core/src/test/scala/org/apache/spark/scheduler/OutputCommitCoordinatorIntegrationSuite.scala new file mode 100644 index 0000000000000..1ae5b030f0832 --- /dev/null +++ b/core/src/test/scala/org/apache/spark/scheduler/OutputCommitCoordinatorIntegrationSuite.scala @@ -0,0 +1,68 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.spark.scheduler + +import org.apache.hadoop.mapred.{FileOutputCommitter, TaskAttemptContext} +import org.scalatest.concurrent.Timeouts +import org.scalatest.time.{Span, Seconds} + +import org.apache.spark.{SparkConf, SparkContext, LocalSparkContext, SparkFunSuite, TaskContext} +import org.apache.spark.util.Utils + +/** + * Integration tests for the OutputCommitCoordinator. + * + * See also: [[OutputCommitCoordinatorSuite]] for unit tests that use mocks. 
+ */ +class OutputCommitCoordinatorIntegrationSuite + extends SparkFunSuite + with LocalSparkContext + with Timeouts { + + override def beforeAll(): Unit = { + super.beforeAll() + val conf = new SparkConf() + .set("master", "local[2,4]") + .set("spark.speculation", "true") + .set("spark.hadoop.mapred.output.committer.class", + classOf[ThrowExceptionOnFirstAttemptOutputCommitter].getCanonicalName) + sc = new SparkContext("local[2, 4]", "test", conf) + } + + test("exception thrown in OutputCommitter.commitTask()") { + // Regression test for SPARK-10381 + failAfter(Span(60, Seconds)) { + val tempDir = Utils.createTempDir() + try { + sc.parallelize(1 to 4, 2).map(_.toString).saveAsTextFile(tempDir.getAbsolutePath + "/out") + } finally { + Utils.deleteRecursively(tempDir) + } + } + } +} + +private class ThrowExceptionOnFirstAttemptOutputCommitter extends FileOutputCommitter { + override def commitTask(context: TaskAttemptContext): Unit = { + val ctx = TaskContext.get() + if (ctx.attemptNumber < 1) { + throw new java.io.FileNotFoundException("Intentional exception") + } + super.commitTask(context) + } +} diff --git a/core/src/test/scala/org/apache/spark/scheduler/OutputCommitCoordinatorSuite.scala b/core/src/test/scala/org/apache/spark/scheduler/OutputCommitCoordinatorSuite.scala index e5ecd4b7c2610..6d08d7c5b7d2a 100644 --- a/core/src/test/scala/org/apache/spark/scheduler/OutputCommitCoordinatorSuite.scala +++ b/core/src/test/scala/org/apache/spark/scheduler/OutputCommitCoordinatorSuite.scala @@ -63,6 +63,9 @@ import scala.language.postfixOps * was not in SparkHadoopWriter, the tests would still pass because only one of the * increments would be captured even though the commit in both tasks was executed * erroneously. + * + * See also: [[OutputCommitCoordinatorIntegrationSuite]] for integration tests that do + * not use mocks. 
*/ class OutputCommitCoordinatorSuite extends SparkFunSuite with BeforeAndAfter { @@ -164,27 +167,28 @@ class OutputCommitCoordinatorSuite extends SparkFunSuite with BeforeAndAfter { test("Only authorized committer failures can clear the authorized committer lock (SPARK-6614)") { val stage: Int = 1 - val partition: Long = 2 - val authorizedCommitter: Long = 3 - val nonAuthorizedCommitter: Long = 100 + val partition: Int = 2 + val authorizedCommitter: Int = 3 + val nonAuthorizedCommitter: Int = 100 outputCommitCoordinator.stageStart(stage) - assert(outputCommitCoordinator.canCommit(stage, partition, attempt = authorizedCommitter)) - assert(!outputCommitCoordinator.canCommit(stage, partition, attempt = nonAuthorizedCommitter)) + + assert(outputCommitCoordinator.canCommit(stage, partition, authorizedCommitter)) + assert(!outputCommitCoordinator.canCommit(stage, partition, nonAuthorizedCommitter)) // The non-authorized committer fails outputCommitCoordinator.taskCompleted( - stage, partition, attempt = nonAuthorizedCommitter, reason = TaskKilled) + stage, partition, attemptNumber = nonAuthorizedCommitter, reason = TaskKilled) // New tasks should still not be able to commit because the authorized committer has not failed assert( - !outputCommitCoordinator.canCommit(stage, partition, attempt = nonAuthorizedCommitter + 1)) + !outputCommitCoordinator.canCommit(stage, partition, nonAuthorizedCommitter + 1)) // The authorized committer now fails, clearing the lock outputCommitCoordinator.taskCompleted( - stage, partition, attempt = authorizedCommitter, reason = TaskKilled) + stage, partition, attemptNumber = authorizedCommitter, reason = TaskKilled) // A new task should now be allowed to become the authorized committer assert( - outputCommitCoordinator.canCommit(stage, partition, attempt = nonAuthorizedCommitter + 2)) + outputCommitCoordinator.canCommit(stage, partition, nonAuthorizedCommitter + 2)) // There can only be one authorized committer assert( - !outputCommitCoordinator.canCommit(stage, partition, attempt = nonAuthorizedCommitter + 3)) + !outputCommitCoordinator.canCommit(stage, partition, nonAuthorizedCommitter + 3)) } } diff --git a/core/src/test/scala/org/apache/spark/util/JsonProtocolSuite.scala b/core/src/test/scala/org/apache/spark/util/JsonProtocolSuite.scala index 47e548ef0d442..143c1b901df11 100644 --- a/core/src/test/scala/org/apache/spark/util/JsonProtocolSuite.scala +++ b/core/src/test/scala/org/apache/spark/util/JsonProtocolSuite.scala @@ -499,7 +499,7 @@ class JsonProtocolSuite extends SparkFunSuite { private def assertEquals(info1: TaskInfo, info2: TaskInfo) { assert(info1.taskId === info2.taskId) assert(info1.index === info2.index) - assert(info1.attempt === info2.attempt) + assert(info1.attemptNumber === info2.attemptNumber) assert(info1.launchTime === info2.launchTime) assert(info1.executorId === info2.executorId) assert(info1.host === info2.host) diff --git a/project/MimaExcludes.scala b/project/MimaExcludes.scala index 46026c1e90ea0..1c96b0958586f 100644 --- a/project/MimaExcludes.scala +++ b/project/MimaExcludes.scala @@ -45,7 +45,7 @@ object MimaExcludes { excludePackage("org.apache.spark.sql.execution") ) ++ MimaBuild.excludeSparkClass("streaming.flume.FlumeTestUtils") ++ - MimaBuild.excludeSparkClass("streaming.flume.PollingFlumeTestUtils") ++ + MimaBuild.excludeSparkClass("streaming.flume.PollingFlumeTestUtils") ++ Seq( ProblemFilters.exclude[MissingMethodProblem]( "org.apache.spark.ml.classification.LogisticCostFun.this"), @@ -53,6 +53,23 @@ object MimaExcludes { 
"org.apache.spark.ml.classification.LogisticAggregator.add"), ProblemFilters.exclude[MissingMethodProblem]( "org.apache.spark.ml.classification.LogisticAggregator.count") + ) ++ Seq( + // SPARK-10381 Fix types / units in private AskPermissionToCommitOutput RPC message. + // This class is marked as `private` but MiMa still seems to be confused by the change. + ProblemFilters.exclude[MissingMethodProblem]( + "org.apache.spark.scheduler.AskPermissionToCommitOutput.task"), + ProblemFilters.exclude[IncompatibleResultTypeProblem]( + "org.apache.spark.scheduler.AskPermissionToCommitOutput.copy$default$2"), + ProblemFilters.exclude[IncompatibleMethTypeProblem]( + "org.apache.spark.scheduler.AskPermissionToCommitOutput.copy"), + ProblemFilters.exclude[MissingMethodProblem]( + "org.apache.spark.scheduler.AskPermissionToCommitOutput.taskAttempt"), + ProblemFilters.exclude[IncompatibleResultTypeProblem]( + "org.apache.spark.scheduler.AskPermissionToCommitOutput.copy$default$3"), + ProblemFilters.exclude[IncompatibleMethTypeProblem]( + "org.apache.spark.scheduler.AskPermissionToCommitOutput.this"), + ProblemFilters.exclude[IncompatibleMethTypeProblem]( + "org.apache.spark.scheduler.AskPermissionToCommitOutput.apply") ) case v if v.startsWith("1.5") => Seq( @@ -213,6 +230,23 @@ object MimaExcludes { // SPARK-9704 Made ProbabilisticClassifier, Identifiable, VectorUDT public APIs ProblemFilters.exclude[IncompatibleResultTypeProblem]( "org.apache.spark.mllib.linalg.VectorUDT.serialize") + ) ++ Seq( + // SPARK-10381 Fix types / units in private AskPermissionToCommitOutput RPC message. + // This class is marked as `private` but MiMa still seems to be confused by the change. + ProblemFilters.exclude[MissingMethodProblem]( + "org.apache.spark.scheduler.AskPermissionToCommitOutput.task"), + ProblemFilters.exclude[IncompatibleResultTypeProblem]( + "org.apache.spark.scheduler.AskPermissionToCommitOutput.copy$default$2"), + ProblemFilters.exclude[IncompatibleMethTypeProblem]( + "org.apache.spark.scheduler.AskPermissionToCommitOutput.copy"), + ProblemFilters.exclude[MissingMethodProblem]( + "org.apache.spark.scheduler.AskPermissionToCommitOutput.taskAttempt"), + ProblemFilters.exclude[IncompatibleResultTypeProblem]( + "org.apache.spark.scheduler.AskPermissionToCommitOutput.copy$default$3"), + ProblemFilters.exclude[IncompatibleMethTypeProblem]( + "org.apache.spark.scheduler.AskPermissionToCommitOutput.this"), + ProblemFilters.exclude[IncompatibleMethTypeProblem]( + "org.apache.spark.scheduler.AskPermissionToCommitOutput.apply") ) case v if v.startsWith("1.4") => diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/WriterContainer.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/WriterContainer.scala index f8ef674ed29c1..cfd64c1d9eb34 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/WriterContainer.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/WriterContainer.scala @@ -198,8 +198,7 @@ private[sql] abstract class BaseWriterContainer( } def commitTask(): Unit = { - SparkHadoopMapRedUtil.commitTask( - outputCommitter, taskAttemptContext, jobId.getId, taskId.getId, taskAttemptId.getId) + SparkHadoopMapRedUtil.commitTask(outputCommitter, taskAttemptContext, jobId.getId, taskId.getId) } def abortTask(): Unit = { diff --git a/sql/core/src/test/scala/org/apache/spark/sql/execution/ui/SQLListenerSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/execution/ui/SQLListenerSuite.scala index 
2bbb41ca777b7..7a46c69a056b1 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/execution/ui/SQLListenerSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/execution/ui/SQLListenerSuite.scala @@ -54,9 +54,9 @@ class SQLListenerSuite extends SparkFunSuite with SharedSQLContext { details = "" ) - private def createTaskInfo(taskId: Int, attempt: Int): TaskInfo = new TaskInfo( + private def createTaskInfo(taskId: Int, attemptNumber: Int): TaskInfo = new TaskInfo( taskId = taskId, - attempt = attempt, + attemptNumber = attemptNumber, // The following fields are not used in tests index = 0, launchTime = 0, diff --git a/sql/hive/src/main/scala/org/apache/spark/sql/hive/hiveWriterContainers.scala b/sql/hive/src/main/scala/org/apache/spark/sql/hive/hiveWriterContainers.scala index 4ca8042d22367..c8d6b718045a5 100644 --- a/sql/hive/src/main/scala/org/apache/spark/sql/hive/hiveWriterContainers.scala +++ b/sql/hive/src/main/scala/org/apache/spark/sql/hive/hiveWriterContainers.scala @@ -121,7 +121,7 @@ private[hive] class SparkHiveWriterContainer( } protected def commit() { - SparkHadoopMapRedUtil.commitTask(committer, taskContext, jobID, splitID, attemptID) + SparkHadoopMapRedUtil.commitTask(committer, taskContext, jobID, splitID) } private def setIDs(jobId: Int, splitId: Int, attemptId: Int) { From 35a19f3357d2ec017cfefb90f1018403e9617de4 Mon Sep 17 00:00:00 2001 From: Andrew Or Date: Tue, 15 Sep 2015 17:24:32 -0700 Subject: [PATCH 305/802] [SPARK-10613] [SPARK-10624] [SQL] Reduce LocalNode tests dependency on SQLContext Instead of relying on `DataFrames` to verify our answers, we can just use simple arrays. This significantly simplifies the test logic for `LocalNode`s and reduces a lot of code duplicated from `SparkPlanTest`. This also fixes an additional issue [SPARK-10624](https://issues.apache.org/jira/browse/SPARK-10624) where the output of `TakeOrderedAndProjectNode` is not actually ordered. Author: Andrew Or Closes #8764 from andrewor14/sql-local-tests-cleanup. 
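A small sketch of the SPARK-10624 pitfall, written against plain Scala collections rather than Spark's bounded priority queue (names and numbers are illustrative only): a size-bounded heap retains the correct top-k elements, but iterating it yields heap order, so the result has to be sorted explicitly before it is handed back.

```
import scala.collection.mutable

object TopKOrderingSketch {
  // Keep the k smallest values using a max-heap of size k.
  def topK(input: Seq[Int], k: Int): Seq[Int] = {
    val heap = mutable.PriorityQueue.empty[Int] // max-heap by default
    input.foreach { x =>
      heap.enqueue(x)
      if (heap.size > k) heap.dequeue() // evict the current maximum
    }
    heap.toSeq // right elements, but in unspecified heap order
  }

  def main(args: Array[String]): Unit = {
    val data = Seq(9, 1, 7, 3, 5, 2, 8)
    println(topK(data, 3))        // correct set {1, 2, 3}, but not necessarily sorted
    println(topK(data, 3).sorted) // the fix: sort before emitting -> List(1, 2, 3)
  }
}
```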
--- .../spark/sql/execution/local/LocalNode.scala | 8 +- .../sql/execution/local/SampleNode.scala | 16 +- .../local/TakeOrderedAndProjectNode.scala | 2 +- .../spark/sql/execution/SparkPlanTest.scala | 2 +- .../spark/sql/execution/local/DummyNode.scala | 68 ++++ .../sql/execution/local/ExpandNodeSuite.scala | 54 ++- .../sql/execution/local/FilterNodeSuite.scala | 34 +- .../execution/local/HashJoinNodeSuite.scala | 141 ++++---- .../execution/local/IntersectNodeSuite.scala | 24 +- .../sql/execution/local/LimitNodeSuite.scala | 28 +- .../sql/execution/local/LocalNodeSuite.scala | 73 +--- .../sql/execution/local/LocalNodeTest.scala | 165 ++------- .../local/NestedLoopJoinNodeSuite.scala | 316 ++++++------------ .../execution/local/ProjectNodeSuite.scala | 39 ++- .../sql/execution/local/SampleNodeSuite.scala | 35 +- .../TakeOrderedAndProjectNodeSuite.scala | 50 ++- .../sql/execution/local/UnionNodeSuite.scala | 49 +-- 17 files changed, 468 insertions(+), 636 deletions(-) create mode 100644 sql/core/src/test/scala/org/apache/spark/sql/execution/local/DummyNode.scala diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/local/LocalNode.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/local/LocalNode.scala index 569cff565c092..f96b62a67a254 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/local/LocalNode.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/local/LocalNode.scala @@ -24,7 +24,7 @@ import org.apache.spark.sql.{SQLConf, Row} import org.apache.spark.sql.catalyst.{CatalystTypeConverters, InternalRow} import org.apache.spark.sql.catalyst.expressions._ import org.apache.spark.sql.catalyst.expressions.codegen._ -import org.apache.spark.sql.catalyst.trees.TreeNode +import org.apache.spark.sql.catalyst.plans.QueryPlan import org.apache.spark.sql.types.StructType /** @@ -33,18 +33,14 @@ import org.apache.spark.sql.types.StructType * Before consuming the iterator, open function must be called. * After consuming the iterator, close function must be called. */ -abstract class LocalNode(conf: SQLConf) extends TreeNode[LocalNode] with Logging { +abstract class LocalNode(conf: SQLConf) extends QueryPlan[LocalNode] with Logging { protected val codegenEnabled: Boolean = conf.codegenEnabled protected val unsafeEnabled: Boolean = conf.unsafeEnabled - lazy val schema: StructType = StructType.fromAttributes(output) - private[this] lazy val isTesting: Boolean = sys.props.contains("spark.testing") - def output: Seq[Attribute] - /** * Called before open(). Prepare can be used to reserve memory needed. It must NOT consume * any input data. diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/local/SampleNode.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/local/SampleNode.scala index abf3df1c0c2af..793700803f216 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/local/SampleNode.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/local/SampleNode.scala @@ -17,13 +17,12 @@ package org.apache.spark.sql.execution.local -import java.util.Random - import org.apache.spark.sql.SQLConf import org.apache.spark.sql.catalyst.InternalRow import org.apache.spark.sql.catalyst.expressions.Attribute import org.apache.spark.util.random.{BernoulliCellSampler, PoissonSampler} + /** * Sample the dataset. 
* @@ -51,18 +50,15 @@ case class SampleNode( override def open(): Unit = { child.open() - val (sampler, _seed) = if (withReplacement) { - val random = new Random(seed) + val sampler = + if (withReplacement) { // Disable gap sampling since the gap sampling method buffers two rows internally, // requiring us to copy the row, which is more expensive than the random number generator. - (new PoissonSampler[InternalRow](upperBound - lowerBound, useGapSamplingIfPossible = false), - // Use the seed for partition 0 like PartitionwiseSampledRDD to generate the same result - // of DataFrame - random.nextLong()) + new PoissonSampler[InternalRow](upperBound - lowerBound, useGapSamplingIfPossible = false) } else { - (new BernoulliCellSampler[InternalRow](lowerBound, upperBound), seed) + new BernoulliCellSampler[InternalRow](lowerBound, upperBound) } - sampler.setSeed(_seed) + sampler.setSeed(seed) iterator = sampler.sample(child.asIterator) } diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/local/TakeOrderedAndProjectNode.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/local/TakeOrderedAndProjectNode.scala index 53f1dcc65d8cf..ae672fbca8d83 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/local/TakeOrderedAndProjectNode.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/local/TakeOrderedAndProjectNode.scala @@ -50,7 +50,7 @@ case class TakeOrderedAndProjectNode( } // Close it eagerly since we don't need it. child.close() - iterator = queue.iterator + iterator = queue.toArray.sorted(ord).iterator } override def next(): Boolean = { diff --git a/sql/core/src/test/scala/org/apache/spark/sql/execution/SparkPlanTest.scala b/sql/core/src/test/scala/org/apache/spark/sql/execution/SparkPlanTest.scala index de45ae4635fb7..3d218f01c9ead 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/execution/SparkPlanTest.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/execution/SparkPlanTest.scala @@ -238,7 +238,7 @@ object SparkPlanTest { outputPlan transform { case plan: SparkPlan => val inputMap = plan.children.flatMap(_.output).map(a => (a.name, a)).toMap - plan.transformExpressions { + plan transformExpressions { case UnresolvedAttribute(Seq(u)) => inputMap.getOrElse(u, sys.error(s"Invalid Test: Cannot resolve $u given input $inputMap")) diff --git a/sql/core/src/test/scala/org/apache/spark/sql/execution/local/DummyNode.scala b/sql/core/src/test/scala/org/apache/spark/sql/execution/local/DummyNode.scala new file mode 100644 index 0000000000000..efc3227dd60d8 --- /dev/null +++ b/sql/core/src/test/scala/org/apache/spark/sql/execution/local/DummyNode.scala @@ -0,0 +1,68 @@ +/* +* Licensed to the Apache Software Foundation (ASF) under one or more +* contributor license agreements. See the NOTICE file distributed with +* this work for additional information regarding copyright ownership. +* The ASF licenses this file to You under the Apache License, Version 2.0 +* (the "License"); you may not use this file except in compliance with +* the License. You may obtain a copy of the License at +* +* http://www.apache.org/licenses/LICENSE-2.0 +* +* Unless required by applicable law or agreed to in writing, software +* distributed under the License is distributed on an "AS IS" BASIS, +* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +* See the License for the specific language governing permissions and +* limitations under the License. 
+*/ + +package org.apache.spark.sql.execution.local + +import org.apache.spark.sql.SQLConf +import org.apache.spark.sql.catalyst.InternalRow +import org.apache.spark.sql.catalyst.expressions.Attribute +import org.apache.spark.sql.catalyst.plans.logical.LocalRelation + +/** + * A dummy [[LocalNode]] that just returns rows from a [[LocalRelation]]. + */ +private[local] case class DummyNode( + output: Seq[Attribute], + relation: LocalRelation, + conf: SQLConf) + extends LocalNode(conf) { + + import DummyNode._ + + private var index: Int = CLOSED + private val input: Seq[InternalRow] = relation.data + + def this(output: Seq[Attribute], data: Seq[Product], conf: SQLConf = new SQLConf) { + this(output, LocalRelation.fromProduct(output, data), conf) + } + + def isOpen: Boolean = index != CLOSED + + override def children: Seq[LocalNode] = Seq.empty + + override def open(): Unit = { + index = -1 + } + + override def next(): Boolean = { + index += 1 + index < input.size + } + + override def fetch(): InternalRow = { + assert(index >= 0 && index < input.size) + input(index) + } + + override def close(): Unit = { + index = CLOSED + } +} + +private object DummyNode { + val CLOSED: Int = Int.MinValue +} diff --git a/sql/core/src/test/scala/org/apache/spark/sql/execution/local/ExpandNodeSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/execution/local/ExpandNodeSuite.scala index cfa7f3f6dcb97..bbd94d8da2d11 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/execution/local/ExpandNodeSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/execution/local/ExpandNodeSuite.scala @@ -17,35 +17,33 @@ package org.apache.spark.sql.execution.local +import org.apache.spark.sql.catalyst.dsl.expressions._ + + class ExpandNodeSuite extends LocalNodeTest { - import testImplicits._ - - test("expand") { - val input = Seq((1, 1), (2, 2), (3, 3), (4, 4), (5, 5)).toDF("key", "value") - checkAnswer( - input, - node => - ExpandNode(conf, Seq( - Seq( - input.col("key") + input.col("value"), input.col("key") - input.col("value") - ).map(_.expr), - Seq( - input.col("key") * input.col("value"), input.col("key") / input.col("value") - ).map(_.expr) - ), node.output, node), - Seq( - (2, 0), - (1, 1), - (4, 0), - (4, 1), - (6, 0), - (9, 1), - (8, 0), - (16, 1), - (10, 0), - (25, 1) - ).toDF().collect() - ) + private def testExpand(inputData: Array[(Int, Int)] = Array.empty): Unit = { + val inputNode = new DummyNode(kvIntAttributes, inputData) + val projections = Seq(Seq('k + 'v, 'k - 'v), Seq('k * 'v, 'k / 'v)) + val expandNode = new ExpandNode(conf, projections, inputNode.output, inputNode) + val resolvedNode = resolveExpressions(expandNode) + val expectedOutput = { + val firstHalf = inputData.map { case (k, v) => (k + v, k - v) } + val secondHalf = inputData.map { case (k, v) => (k * v, k / v) } + firstHalf ++ secondHalf + } + val actualOutput = resolvedNode.collect().map { case row => + (row.getInt(0), row.getInt(1)) + } + assert(actualOutput.toSet === expectedOutput.toSet) + } + + test("empty") { + testExpand() } + + test("basic") { + testExpand((1 to 100).map { i => (i, i * 1000) }.toArray) + } + } diff --git a/sql/core/src/test/scala/org/apache/spark/sql/execution/local/FilterNodeSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/execution/local/FilterNodeSuite.scala index a12670e347c25..4eadce646d379 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/execution/local/FilterNodeSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/execution/local/FilterNodeSuite.scala @@ -17,25 
+17,29 @@ package org.apache.spark.sql.execution.local -import org.apache.spark.sql.test.SharedSQLContext +import org.apache.spark.sql.catalyst.dsl.expressions._ -class FilterNodeSuite extends LocalNodeTest with SharedSQLContext { - test("basic") { - val condition = (testData.col("key") % 2) === 0 - checkAnswer( - testData, - node => FilterNode(conf, condition.expr, node), - testData.filter(condition).collect() - ) +class FilterNodeSuite extends LocalNodeTest { + + private def testFilter(inputData: Array[(Int, Int)] = Array.empty): Unit = { + val cond = 'k % 2 === 0 + val inputNode = new DummyNode(kvIntAttributes, inputData) + val filterNode = new FilterNode(conf, cond, inputNode) + val resolvedNode = resolveExpressions(filterNode) + val expectedOutput = inputData.filter { case (k, _) => k % 2 == 0 } + val actualOutput = resolvedNode.collect().map { case row => + (row.getInt(0), row.getInt(1)) + } + assert(actualOutput === expectedOutput) } test("empty") { - val condition = (emptyTestData.col("key") % 2) === 0 - checkAnswer( - emptyTestData, - node => FilterNode(conf, condition.expr, node), - emptyTestData.filter(condition).collect() - ) + testFilter() + } + + test("basic") { + testFilter((1 to 100).map { i => (i, i) }.toArray) } + } diff --git a/sql/core/src/test/scala/org/apache/spark/sql/execution/local/HashJoinNodeSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/execution/local/HashJoinNodeSuite.scala index 78d891351f4a9..5c1bdb088eeed 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/execution/local/HashJoinNodeSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/execution/local/HashJoinNodeSuite.scala @@ -18,99 +18,80 @@ package org.apache.spark.sql.execution.local import org.apache.spark.sql.SQLConf -import org.apache.spark.sql.execution.joins +import org.apache.spark.sql.catalyst.dsl.expressions._ +import org.apache.spark.sql.execution.joins.{BuildLeft, BuildRight, BuildSide} + class HashJoinNodeSuite extends LocalNodeTest { - import testImplicits._ + // Test all combinations of the two dimensions: with/out unsafe and build sides + private val maybeUnsafeAndCodegen = Seq(false, true) + private val buildSides = Seq(BuildLeft, BuildRight) + maybeUnsafeAndCodegen.foreach { unsafeAndCodegen => + buildSides.foreach { buildSide => + testJoin(unsafeAndCodegen, buildSide) + } + } - def joinSuite(suiteName: String, confPairs: (String, String)*): Unit = { - test(s"$suiteName: inner join with one match per row") { - withSQLConf(confPairs: _*) { - checkAnswer2( - upperCaseData, - lowerCaseData, - wrapForUnsafe( - (node1, node2) => HashJoinNode( - conf, - Seq(upperCaseData.col("N").expr), - Seq(lowerCaseData.col("n").expr), - joins.BuildLeft, - node1, - node2) - ), - upperCaseData.join(lowerCaseData, $"n" === $"N").collect() - ) + /** + * Test inner hash join with varying degrees of matches. 
+ */ + private def testJoin( + unsafeAndCodegen: Boolean, + buildSide: BuildSide): Unit = { + val simpleOrUnsafe = if (!unsafeAndCodegen) "simple" else "unsafe" + val testNamePrefix = s"$simpleOrUnsafe / $buildSide" + val someData = (1 to 100).map { i => (i, "burger" + i) }.toArray + val conf = new SQLConf + conf.setConf(SQLConf.UNSAFE_ENABLED, unsafeAndCodegen) + conf.setConf(SQLConf.CODEGEN_ENABLED, unsafeAndCodegen) + + // Actual test body + def runTest(leftInput: Array[(Int, String)], rightInput: Array[(Int, String)]): Unit = { + val rightInputMap = rightInput.toMap + val leftNode = new DummyNode(joinNameAttributes, leftInput) + val rightNode = new DummyNode(joinNicknameAttributes, rightInput) + val makeNode = (node1: LocalNode, node2: LocalNode) => { + resolveExpressions(new HashJoinNode( + conf, Seq('id1), Seq('id2), buildSide, node1, node2)) + } + val makeUnsafeNode = if (unsafeAndCodegen) wrapForUnsafe(makeNode) else makeNode + val hashJoinNode = makeUnsafeNode(leftNode, rightNode) + val expectedOutput = leftInput + .filter { case (k, _) => rightInputMap.contains(k) } + .map { case (k, v) => (k, v, k, rightInputMap(k)) } + val actualOutput = hashJoinNode.collect().map { row => + // (id, name, id, nickname) + (row.getInt(0), row.getString(1), row.getInt(2), row.getString(3)) } + assert(actualOutput === expectedOutput) } - test(s"$suiteName: inner join with multiple matches") { - withSQLConf(confPairs: _*) { - val x = testData2.where($"a" === 1).as("x") - val y = testData2.where($"a" === 1).as("y") - checkAnswer2( - x, - y, - wrapForUnsafe( - (node1, node2) => HashJoinNode( - conf, - Seq(x.col("a").expr), - Seq(y.col("a").expr), - joins.BuildLeft, - node1, - node2) - ), - x.join(y).where($"x.a" === $"y.a").collect() - ) - } + test(s"$testNamePrefix: empty") { + runTest(Array.empty, Array.empty) + runTest(someData, Array.empty) + runTest(Array.empty, someData) } - test(s"$suiteName: inner join, no matches") { - withSQLConf(confPairs: _*) { - val x = testData2.where($"a" === 1).as("x") - val y = testData2.where($"a" === 2).as("y") - checkAnswer2( - x, - y, - wrapForUnsafe( - (node1, node2) => HashJoinNode( - conf, - Seq(x.col("a").expr), - Seq(y.col("a").expr), - joins.BuildLeft, - node1, - node2) - ), - Nil - ) - } + test(s"$testNamePrefix: no matches") { + val someIrrelevantData = (10000 to 100100).map { i => (i, "piper" + i) }.toArray + runTest(someData, Array.empty) + runTest(Array.empty, someData) + runTest(someData, someIrrelevantData) + runTest(someIrrelevantData, someData) } - test(s"$suiteName: big inner join, 4 matches per row") { - withSQLConf(confPairs: _*) { - val bigData = testData.unionAll(testData).unionAll(testData).unionAll(testData) - val bigDataX = bigData.as("x") - val bigDataY = bigData.as("y") + test(s"$testNamePrefix: partial matches") { + val someOtherData = (50 to 150).map { i => (i, "finnegan" + i) }.toArray + runTest(someData, someOtherData) + runTest(someOtherData, someData) + } - checkAnswer2( - bigDataX, - bigDataY, - wrapForUnsafe( - (node1, node2) => - HashJoinNode( - conf, - Seq(bigDataX.col("key").expr), - Seq(bigDataY.col("key").expr), - joins.BuildLeft, - node1, - node2) - ), - bigDataX.join(bigDataY).where($"x.key" === $"y.key").collect()) - } + test(s"$testNamePrefix: full matches") { + val someSuperRelevantData = someData.map { case (k, v) => (k, "cooper" + v) }.toArray + runTest(someData, someSuperRelevantData) + runTest(someSuperRelevantData, someData) } } - joinSuite( - "general", SQLConf.CODEGEN_ENABLED.key -> "false", 
SQLConf.UNSAFE_ENABLED.key -> "false") - joinSuite("tungsten", SQLConf.CODEGEN_ENABLED.key -> "true", SQLConf.UNSAFE_ENABLED.key -> "true") } diff --git a/sql/core/src/test/scala/org/apache/spark/sql/execution/local/IntersectNodeSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/execution/local/IntersectNodeSuite.scala index 7deaa375fcfc2..c0ad2021b204a 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/execution/local/IntersectNodeSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/execution/local/IntersectNodeSuite.scala @@ -17,19 +17,21 @@ package org.apache.spark.sql.execution.local -class IntersectNodeSuite extends LocalNodeTest { - import testImplicits._ +class IntersectNodeSuite extends LocalNodeTest { test("basic") { - val input1 = (1 to 10).map(i => (i, i.toString)).toDF("key", "value") - val input2 = (1 to 10).filter(_ % 2 == 0).map(i => (i, i.toString)).toDF("key", "value") - - checkAnswer2( - input1, - input2, - (node1, node2) => IntersectNode(conf, node1, node2), - input1.intersect(input2).collect() - ) + val n = 100 + val leftData = (1 to n).filter { i => i % 2 == 0 }.map { i => (i, i) }.toArray + val rightData = (1 to n).filter { i => i % 3 == 0 }.map { i => (i, i) }.toArray + val leftNode = new DummyNode(kvIntAttributes, leftData) + val rightNode = new DummyNode(kvIntAttributes, rightData) + val intersectNode = new IntersectNode(conf, leftNode, rightNode) + val expectedOutput = leftData.intersect(rightData) + val actualOutput = intersectNode.collect().map { case row => + (row.getInt(0), row.getInt(1)) + } + assert(actualOutput === expectedOutput) } + } diff --git a/sql/core/src/test/scala/org/apache/spark/sql/execution/local/LimitNodeSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/execution/local/LimitNodeSuite.scala index 3b183902007e4..fb790636a3689 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/execution/local/LimitNodeSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/execution/local/LimitNodeSuite.scala @@ -17,23 +17,25 @@ package org.apache.spark.sql.execution.local -import org.apache.spark.sql.test.SharedSQLContext -class LimitNodeSuite extends LocalNodeTest with SharedSQLContext { +class LimitNodeSuite extends LocalNodeTest { - test("basic") { - checkAnswer( - testData, - node => LimitNode(conf, 10, node), - testData.limit(10).collect() - ) + private def testLimit(inputData: Array[(Int, Int)] = Array.empty, limit: Int = 10): Unit = { + val inputNode = new DummyNode(kvIntAttributes, inputData) + val limitNode = new LimitNode(conf, limit, inputNode) + val expectedOutput = inputData.take(limit) + val actualOutput = limitNode.collect().map { case row => + (row.getInt(0), row.getInt(1)) + } + assert(actualOutput === expectedOutput) } test("empty") { - checkAnswer( - emptyTestData, - node => LimitNode(conf, 10, node), - emptyTestData.limit(10).collect() - ) + testLimit() } + + test("basic") { + testLimit((1 to 100).map { i => (i, i) }.toArray, 20) + } + } diff --git a/sql/core/src/test/scala/org/apache/spark/sql/execution/local/LocalNodeSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/execution/local/LocalNodeSuite.scala index b89fa46f8b3b4..0d1ed99eec6cd 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/execution/local/LocalNodeSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/execution/local/LocalNodeSuite.scala @@ -17,28 +17,24 @@ package org.apache.spark.sql.execution.local -import org.apache.spark.SparkFunSuite -import org.apache.spark.sql.SQLConf -import 
org.apache.spark.sql.catalyst.InternalRow -import org.apache.spark.sql.catalyst.expressions._ -import org.apache.spark.sql.types.IntegerType -class LocalNodeSuite extends SparkFunSuite { - private val data = (1 to 100).toArray +class LocalNodeSuite extends LocalNodeTest { + private val data = (1 to 100).map { i => (i, i) }.toArray test("basic open, next, fetch, close") { - val node = new DummyLocalNode(data) + val node = new DummyNode(kvIntAttributes, data) assert(!node.isOpen) node.open() assert(node.isOpen) - data.foreach { i => + data.foreach { case (k, v) => assert(node.next()) // fetch should be idempotent val fetched = node.fetch() assert(node.fetch() === fetched) assert(node.fetch() === fetched) - assert(node.fetch().numFields === 1) - assert(node.fetch().getInt(0) === i) + assert(node.fetch().numFields === 2) + assert(node.fetch().getInt(0) === k) + assert(node.fetch().getInt(1) === v) } assert(!node.next()) node.close() @@ -46,16 +42,17 @@ class LocalNodeSuite extends SparkFunSuite { } test("asIterator") { - val node = new DummyLocalNode(data) + val node = new DummyNode(kvIntAttributes, data) val iter = node.asIterator node.open() - data.foreach { i => + data.foreach { case (k, v) => // hasNext should be idempotent assert(iter.hasNext) assert(iter.hasNext) val item = iter.next() - assert(item.numFields === 1) - assert(item.getInt(0) === i) + assert(item.numFields === 2) + assert(item.getInt(0) === k) + assert(item.getInt(1) === v) } intercept[NoSuchElementException] { iter.next() @@ -64,53 +61,13 @@ class LocalNodeSuite extends SparkFunSuite { } test("collect") { - val node = new DummyLocalNode(data) + val node = new DummyNode(kvIntAttributes, data) node.open() val collected = node.collect() assert(collected.size === data.size) - assert(collected.forall(_.size === 1)) - assert(collected.map(_.getInt(0)) === data) + assert(collected.forall(_.size === 2)) + assert(collected.map { case row => (row.getInt(0), row.getInt(0)) } === data) node.close() } } - -/** - * A dummy [[LocalNode]] that just returns one row per integer in the input. 
- */ -private case class DummyLocalNode(conf: SQLConf, input: Array[Int]) extends LocalNode(conf) { - private var index = Int.MinValue - - def this(input: Array[Int]) { - this(new SQLConf, input) - } - - def isOpen: Boolean = { - index != Int.MinValue - } - - override def output: Seq[Attribute] = { - Seq(AttributeReference("something", IntegerType)()) - } - - override def children: Seq[LocalNode] = Seq.empty - - override def open(): Unit = { - index = -1 - } - - override def next(): Boolean = { - index += 1 - index < input.size - } - - override def fetch(): InternalRow = { - assert(index >= 0 && index < input.size) - val values = Array(input(index).asInstanceOf[Any]) - new GenericInternalRow(values) - } - - override def close(): Unit = { - index = Int.MinValue - } -} diff --git a/sql/core/src/test/scala/org/apache/spark/sql/execution/local/LocalNodeTest.scala b/sql/core/src/test/scala/org/apache/spark/sql/execution/local/LocalNodeTest.scala index 86dd28064cc6a..098050bcd2236 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/execution/local/LocalNodeTest.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/execution/local/LocalNodeTest.scala @@ -17,147 +17,54 @@ package org.apache.spark.sql.execution.local -import scala.util.control.NonFatal - import org.apache.spark.SparkFunSuite -import org.apache.spark.sql.{DataFrame, Row, SQLConf} -import org.apache.spark.sql.test.{SharedSQLContext, SQLTestUtils} +import org.apache.spark.sql.SQLConf +import org.apache.spark.sql.catalyst.analysis.UnresolvedAttribute +import org.apache.spark.sql.catalyst.expressions.AttributeReference +import org.apache.spark.sql.types.{IntegerType, StringType} -class LocalNodeTest extends SparkFunSuite with SharedSQLContext { - def conf: SQLConf = sqlContext.conf +class LocalNodeTest extends SparkFunSuite { - protected def wrapForUnsafe( - f: (LocalNode, LocalNode) => LocalNode): (LocalNode, LocalNode) => LocalNode = { - if (conf.unsafeEnabled) { - (left: LocalNode, right: LocalNode) => { - val _left = ConvertToUnsafeNode(conf, left) - val _right = ConvertToUnsafeNode(conf, right) - val r = f(_left, _right) - ConvertToSafeNode(conf, r) - } - } else { - f - } - } - - /** - * Runs the LocalNode and makes sure the answer matches the expected result. - * @param input the input data to be used. - * @param nodeFunction a function which accepts the input LocalNode and uses it to instantiate - * the local physical operator that's being tested. - * @param expectedAnswer the expected result in a [[Seq]] of [[Row]]s. - * @param sortAnswers if true, the answers will be sorted by their toString representations prior - * to being compared. - */ - protected def checkAnswer( - input: DataFrame, - nodeFunction: LocalNode => LocalNode, - expectedAnswer: Seq[Row], - sortAnswers: Boolean = true): Unit = { - doCheckAnswer( - input :: Nil, - nodes => nodeFunction(nodes.head), - expectedAnswer, - sortAnswers) - } - - /** - * Runs the LocalNode and makes sure the answer matches the expected result. - * @param left the left input data to be used. - * @param right the right input data to be used. - * @param nodeFunction a function which accepts the input LocalNode and uses it to instantiate - * the local physical operator that's being tested. - * @param expectedAnswer the expected result in a [[Seq]] of [[Row]]s. - * @param sortAnswers if true, the answers will be sorted by their toString representations prior - * to being compared. 
- */ - protected def checkAnswer2( - left: DataFrame, - right: DataFrame, - nodeFunction: (LocalNode, LocalNode) => LocalNode, - expectedAnswer: Seq[Row], - sortAnswers: Boolean = true): Unit = { - doCheckAnswer( - left :: right :: Nil, - nodes => nodeFunction(nodes(0), nodes(1)), - expectedAnswer, - sortAnswers) - } + protected val conf: SQLConf = new SQLConf + protected val kvIntAttributes = Seq( + AttributeReference("k", IntegerType)(), + AttributeReference("v", IntegerType)()) + protected val joinNameAttributes = Seq( + AttributeReference("id1", IntegerType)(), + AttributeReference("name", StringType)()) + protected val joinNicknameAttributes = Seq( + AttributeReference("id2", IntegerType)(), + AttributeReference("nickname", StringType)()) /** - * Runs the `LocalNode`s and makes sure the answer matches the expected result. - * @param input the input data to be used. - * @param nodeFunction a function which accepts a sequence of input `LocalNode`s and uses them to - * instantiate the local physical operator that's being tested. - * @param expectedAnswer the expected result in a [[Seq]] of [[Row]]s. - * @param sortAnswers if true, the answers will be sorted by their toString representations prior - * to being compared. + * Wrap a function processing two [[LocalNode]]s such that: + * (1) all input rows are automatically converted to unsafe rows + * (2) all output rows are automatically converted back to safe rows */ - protected def doCheckAnswer( - input: Seq[DataFrame], - nodeFunction: Seq[LocalNode] => LocalNode, - expectedAnswer: Seq[Row], - sortAnswers: Boolean = true): Unit = { - LocalNodeTest.checkAnswer( - input.map(dataFrameToSeqScanNode), nodeFunction, expectedAnswer, sortAnswers) match { - case Some(errorMessage) => fail(errorMessage) - case None => + protected def wrapForUnsafe( + f: (LocalNode, LocalNode) => LocalNode): (LocalNode, LocalNode) => LocalNode = { + (left: LocalNode, right: LocalNode) => { + val _left = ConvertToUnsafeNode(conf, left) + val _right = ConvertToUnsafeNode(conf, right) + val r = f(_left, _right) + ConvertToSafeNode(conf, r) } } - protected def dataFrameToSeqScanNode(df: DataFrame): SeqScanNode = { - new SeqScanNode( - conf, - df.queryExecution.sparkPlan.output, - df.queryExecution.toRdd.map(_.copy()).collect()) - } - -} - -/** - * Helper methods for writing tests of individual local physical operators. - */ -object LocalNodeTest { - /** - * Runs the `LocalNode`s and makes sure the answer matches the expected result. - * @param input the input data to be used. - * @param nodeFunction a function which accepts the input `LocalNode`s and uses them to - * instantiate the local physical operator that's being tested. - * @param expectedAnswer the expected result in a [[Seq]] of [[Row]]s. - * @param sortAnswers if true, the answers will be sorted by their toString representations prior - * to being compared. + * Recursively resolve all expressions in a [[LocalNode]] using the node's attributes. 
*/ - def checkAnswer( - input: Seq[SeqScanNode], - nodeFunction: Seq[LocalNode] => LocalNode, - expectedAnswer: Seq[Row], - sortAnswers: Boolean): Option[String] = { - - val outputNode = nodeFunction(input) - - val outputResult: Seq[Row] = try { - outputNode.collect() - } catch { - case NonFatal(e) => - val errorMessage = - s""" - | Exception thrown while executing local plan: - | $outputNode - | == Exception == - | $e - | ${org.apache.spark.sql.catalyst.util.stackTraceToString(e)} - """.stripMargin - return Some(errorMessage) - } - - SQLTestUtils.compareAnswers(outputResult, expectedAnswer, sortAnswers).map { errorMessage => - s""" - | Results do not match for local plan: - | $outputNode - | $errorMessage - """.stripMargin + protected def resolveExpressions(outputNode: LocalNode): LocalNode = { + outputNode transform { + case node: LocalNode => + val inputMap = node.output.map { a => (a.name, a) }.toMap + node transformExpressions { + case UnresolvedAttribute(Seq(u)) => + inputMap.getOrElse(u, + sys.error(s"Invalid Test: Cannot resolve $u given input $inputMap")) + } } } + } diff --git a/sql/core/src/test/scala/org/apache/spark/sql/execution/local/NestedLoopJoinNodeSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/execution/local/NestedLoopJoinNodeSuite.scala index b1ef26ba82f16..40299d9d5ee37 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/execution/local/NestedLoopJoinNodeSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/execution/local/NestedLoopJoinNodeSuite.scala @@ -18,222 +18,128 @@ package org.apache.spark.sql.execution.local import org.apache.spark.sql.SQLConf -import org.apache.spark.sql.catalyst.plans.{FullOuter, LeftOuter, RightOuter} +import org.apache.spark.sql.catalyst.dsl.expressions._ +import org.apache.spark.sql.catalyst.plans.{FullOuter, JoinType, LeftOuter, RightOuter} import org.apache.spark.sql.execution.joins.{BuildLeft, BuildRight, BuildSide} + class NestedLoopJoinNodeSuite extends LocalNodeTest { - import testImplicits._ - - private def joinSuite( - suiteName: String, buildSide: BuildSide, confPairs: (String, String)*): Unit = { - test(s"$suiteName: left outer join") { - withSQLConf(confPairs: _*) { - checkAnswer2( - upperCaseData, - lowerCaseData, - wrapForUnsafe( - (node1, node2) => NestedLoopJoinNode( - conf, - node1, - node2, - buildSide, - LeftOuter, - Some((upperCaseData.col("N") === lowerCaseData.col("n")).expr)) - ), - upperCaseData.join(lowerCaseData, $"n" === $"N", "left").collect()) - - checkAnswer2( - upperCaseData, - lowerCaseData, - wrapForUnsafe( - (node1, node2) => NestedLoopJoinNode( - conf, - node1, - node2, - buildSide, - LeftOuter, - Some( - (upperCaseData.col("N") === lowerCaseData.col("n") && - lowerCaseData.col("n") > 1).expr)) - ), - upperCaseData.join(lowerCaseData, $"n" === $"N" && $"n" > 1, "left").collect()) - - checkAnswer2( - upperCaseData, - lowerCaseData, - wrapForUnsafe( - (node1, node2) => NestedLoopJoinNode( - conf, - node1, - node2, - buildSide, - LeftOuter, - Some( - (upperCaseData.col("N") === lowerCaseData.col("n") && - upperCaseData.col("N") > 1).expr)) - ), - upperCaseData.join(lowerCaseData, $"n" === $"N" && $"N" > 1, "left").collect()) - - checkAnswer2( - upperCaseData, - lowerCaseData, - wrapForUnsafe( - (node1, node2) => NestedLoopJoinNode( - conf, - node1, - node2, - buildSide, - LeftOuter, - Some( - (upperCaseData.col("N") === lowerCaseData.col("n") && - lowerCaseData.col("l") > upperCaseData.col("L")).expr)) - ), - upperCaseData.join(lowerCaseData, $"n" === $"N" && $"l" > $"L", 
"left").collect()) + // Test all combinations of the three dimensions: with/out unsafe, build sides, and join types + private val maybeUnsafeAndCodegen = Seq(false, true) + private val buildSides = Seq(BuildLeft, BuildRight) + private val joinTypes = Seq(LeftOuter, RightOuter, FullOuter) + maybeUnsafeAndCodegen.foreach { unsafeAndCodegen => + buildSides.foreach { buildSide => + joinTypes.foreach { joinType => + testJoin(unsafeAndCodegen, buildSide, joinType) } } + } - test(s"$suiteName: right outer join") { - withSQLConf(confPairs: _*) { - checkAnswer2( - lowerCaseData, - upperCaseData, - wrapForUnsafe( - (node1, node2) => NestedLoopJoinNode( - conf, - node1, - node2, - buildSide, - RightOuter, - Some((lowerCaseData.col("n") === upperCaseData.col("N")).expr)) - ), - lowerCaseData.join(upperCaseData, $"n" === $"N", "right").collect()) - - checkAnswer2( - lowerCaseData, - upperCaseData, - wrapForUnsafe( - (node1, node2) => NestedLoopJoinNode( - conf, - node1, - node2, - buildSide, - RightOuter, - Some((lowerCaseData.col("n") === upperCaseData.col("N") && - lowerCaseData.col("n") > 1).expr)) - ), - lowerCaseData.join(upperCaseData, $"n" === $"N" && $"n" > 1, "right").collect()) - - checkAnswer2( - lowerCaseData, - upperCaseData, - wrapForUnsafe( - (node1, node2) => NestedLoopJoinNode( - conf, - node1, - node2, - buildSide, - RightOuter, - Some((lowerCaseData.col("n") === upperCaseData.col("N") && - upperCaseData.col("N") > 1).expr)) - ), - lowerCaseData.join(upperCaseData, $"n" === $"N" && $"N" > 1, "right").collect()) - - checkAnswer2( - lowerCaseData, - upperCaseData, - wrapForUnsafe( - (node1, node2) => NestedLoopJoinNode( - conf, - node1, - node2, - buildSide, - RightOuter, - Some((lowerCaseData.col("n") === upperCaseData.col("N") && - lowerCaseData.col("l") > upperCaseData.col("L")).expr)) - ), - lowerCaseData.join(upperCaseData, $"n" === $"N" && $"l" > $"L", "right").collect()) + /** + * Test outer nested loop joins with varying degrees of matches. 
+ */ + private def testJoin( + unsafeAndCodegen: Boolean, + buildSide: BuildSide, + joinType: JoinType): Unit = { + val simpleOrUnsafe = if (!unsafeAndCodegen) "simple" else "unsafe" + val testNamePrefix = s"$simpleOrUnsafe / $buildSide / $joinType" + val someData = (1 to 100).map { i => (i, "burger" + i) }.toArray + val conf = new SQLConf + conf.setConf(SQLConf.UNSAFE_ENABLED, unsafeAndCodegen) + conf.setConf(SQLConf.CODEGEN_ENABLED, unsafeAndCodegen) + + // Actual test body + def runTest( + joinType: JoinType, + leftInput: Array[(Int, String)], + rightInput: Array[(Int, String)]): Unit = { + val leftNode = new DummyNode(joinNameAttributes, leftInput) + val rightNode = new DummyNode(joinNicknameAttributes, rightInput) + val cond = 'id1 === 'id2 + val makeNode = (node1: LocalNode, node2: LocalNode) => { + resolveExpressions( + new NestedLoopJoinNode(conf, node1, node2, buildSide, joinType, Some(cond))) } + val makeUnsafeNode = if (unsafeAndCodegen) wrapForUnsafe(makeNode) else makeNode + val hashJoinNode = makeUnsafeNode(leftNode, rightNode) + val expectedOutput = generateExpectedOutput(leftInput, rightInput, joinType) + val actualOutput = hashJoinNode.collect().map { row => + // (id, name, id, nickname) + (row.getInt(0), row.getString(1), row.getInt(2), row.getString(3)) + } + assert(actualOutput.toSet === expectedOutput.toSet) } - test(s"$suiteName: full outer join") { - withSQLConf(confPairs: _*) { - checkAnswer2( - lowerCaseData, - upperCaseData, - wrapForUnsafe( - (node1, node2) => NestedLoopJoinNode( - conf, - node1, - node2, - buildSide, - FullOuter, - Some((lowerCaseData.col("n") === upperCaseData.col("N")).expr)) - ), - lowerCaseData.join(upperCaseData, $"n" === $"N", "full").collect()) - - checkAnswer2( - lowerCaseData, - upperCaseData, - wrapForUnsafe( - (node1, node2) => NestedLoopJoinNode( - conf, - node1, - node2, - buildSide, - FullOuter, - Some((lowerCaseData.col("n") === upperCaseData.col("N") && - lowerCaseData.col("n") > 1).expr)) - ), - lowerCaseData.join(upperCaseData, $"n" === $"N" && $"n" > 1, "full").collect()) - - checkAnswer2( - lowerCaseData, - upperCaseData, - wrapForUnsafe( - (node1, node2) => NestedLoopJoinNode( - conf, - node1, - node2, - buildSide, - FullOuter, - Some((lowerCaseData.col("n") === upperCaseData.col("N") && - upperCaseData.col("N") > 1).expr)) - ), - lowerCaseData.join(upperCaseData, $"n" === $"N" && $"N" > 1, "full").collect()) - - checkAnswer2( - lowerCaseData, - upperCaseData, - wrapForUnsafe( - (node1, node2) => NestedLoopJoinNode( - conf, - node1, - node2, - buildSide, - FullOuter, - Some((lowerCaseData.col("n") === upperCaseData.col("N") && - lowerCaseData.col("l") > upperCaseData.col("L")).expr)) - ), - lowerCaseData.join(upperCaseData, $"n" === $"N" && $"l" > $"L", "full").collect()) - } + test(s"$testNamePrefix: empty") { + runTest(joinType, Array.empty, Array.empty) + } + + test(s"$testNamePrefix: no matches") { + val someIrrelevantData = (10000 to 10100).map { i => (i, "piper" + i) }.toArray + runTest(joinType, someData, Array.empty) + runTest(joinType, Array.empty, someData) + runTest(joinType, someData, someIrrelevantData) + runTest(joinType, someIrrelevantData, someData) + } + + test(s"$testNamePrefix: partial matches") { + val someOtherData = (50 to 150).map { i => (i, "finnegan" + i) }.toArray + runTest(joinType, someData, someOtherData) + runTest(joinType, someOtherData, someData) + } + + test(s"$testNamePrefix: full matches") { + val someSuperRelevantData = someData.map { case (k, v) => (k, "cooper" + v) } + runTest(joinType, 
someData, someSuperRelevantData) + runTest(joinType, someSuperRelevantData, someData) + } + } + + /** + * Helper method to generate the expected output of a test based on the join type. + */ + private def generateExpectedOutput( + leftInput: Array[(Int, String)], + rightInput: Array[(Int, String)], + joinType: JoinType): Array[(Int, String, Int, String)] = { + joinType match { + case LeftOuter => + val rightInputMap = rightInput.toMap + leftInput.map { case (k, v) => + val rightKey = rightInputMap.get(k).map { _ => k }.getOrElse(0) + val rightValue = rightInputMap.getOrElse(k, null) + (k, v, rightKey, rightValue) + } + + case RightOuter => + val leftInputMap = leftInput.toMap + rightInput.map { case (k, v) => + val leftKey = leftInputMap.get(k).map { _ => k }.getOrElse(0) + val leftValue = leftInputMap.getOrElse(k, null) + (leftKey, leftValue, k, v) + } + + case FullOuter => + val leftInputMap = leftInput.toMap + val rightInputMap = rightInput.toMap + val leftOutput = leftInput.map { case (k, v) => + val rightKey = rightInputMap.get(k).map { _ => k }.getOrElse(0) + val rightValue = rightInputMap.getOrElse(k, null) + (k, v, rightKey, rightValue) + } + val rightOutput = rightInput.map { case (k, v) => + val leftKey = leftInputMap.get(k).map { _ => k }.getOrElse(0) + val leftValue = leftInputMap.getOrElse(k, null) + (leftKey, leftValue, k, v) + } + (leftOutput ++ rightOutput).distinct + + case other => + throw new IllegalArgumentException(s"Join type $other is not applicable") } } - joinSuite( - "general-build-left", - BuildLeft, - SQLConf.CODEGEN_ENABLED.key -> "false", SQLConf.UNSAFE_ENABLED.key -> "false") - joinSuite( - "general-build-right", - BuildRight, - SQLConf.CODEGEN_ENABLED.key -> "false", SQLConf.UNSAFE_ENABLED.key -> "false") - joinSuite( - "tungsten-build-left", - BuildLeft, - SQLConf.CODEGEN_ENABLED.key -> "true", SQLConf.UNSAFE_ENABLED.key -> "true") - joinSuite( - "tungsten-build-right", - BuildRight, - SQLConf.CODEGEN_ENABLED.key -> "true", SQLConf.UNSAFE_ENABLED.key -> "true") } diff --git a/sql/core/src/test/scala/org/apache/spark/sql/execution/local/ProjectNodeSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/execution/local/ProjectNodeSuite.scala index 38e0a230c46d8..02ecb23d34b2f 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/execution/local/ProjectNodeSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/execution/local/ProjectNodeSuite.scala @@ -17,28 +17,33 @@ package org.apache.spark.sql.execution.local -import org.apache.spark.sql.test.SharedSQLContext +import org.apache.spark.sql.catalyst.expressions.{AttributeReference, NamedExpression} +import org.apache.spark.sql.types.{IntegerType, StringType} -class ProjectNodeSuite extends LocalNodeTest with SharedSQLContext { - test("basic") { - val output = testData.queryExecution.sparkPlan.output - val columns = Seq(output(1), output(0)) - checkAnswer( - testData, - node => ProjectNode(conf, columns, node), - testData.select("value", "key").collect() - ) +class ProjectNodeSuite extends LocalNodeTest { + private val pieAttributes = Seq( + AttributeReference("id", IntegerType)(), + AttributeReference("age", IntegerType)(), + AttributeReference("name", StringType)()) + + private def testProject(inputData: Array[(Int, Int, String)] = Array.empty): Unit = { + val inputNode = new DummyNode(pieAttributes, inputData) + val columns = Seq[NamedExpression](inputNode.output(0), inputNode.output(2)) + val projectNode = new ProjectNode(conf, columns, inputNode) + val expectedOutput = inputData.map { case 
(id, age, name) => (id, name) } + val actualOutput = projectNode.collect().map { case row => + (row.getInt(0), row.getString(1)) + } + assert(actualOutput === expectedOutput) } test("empty") { - val output = emptyTestData.queryExecution.sparkPlan.output - val columns = Seq(output(1), output(0)) - checkAnswer( - emptyTestData, - node => ProjectNode(conf, columns, node), - emptyTestData.select("value", "key").collect() - ) + testProject() + } + + test("basic") { + testProject((1 to 100).map { i => (i, i + 1, "pie" + i) }.toArray) } } diff --git a/sql/core/src/test/scala/org/apache/spark/sql/execution/local/SampleNodeSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/execution/local/SampleNodeSuite.scala index 87a7da453999c..a3e83bbd51457 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/execution/local/SampleNodeSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/execution/local/SampleNodeSuite.scala @@ -17,21 +17,32 @@ package org.apache.spark.sql.execution.local -class SampleNodeSuite extends LocalNodeTest { +import org.apache.spark.util.random.{BernoulliCellSampler, PoissonSampler} + - import testImplicits._ +class SampleNodeSuite extends LocalNodeTest { private def testSample(withReplacement: Boolean): Unit = { - test(s"withReplacement: $withReplacement") { - val seed = 0L - val input = sqlContext.sparkContext. - parallelize((1 to 10).map(i => (i, i.toString)), 1). // Should be only 1 partition - toDF("key", "value") - checkAnswer( - input, - node => SampleNode(conf, 0.0, 0.3, withReplacement, seed, node), - input.sample(withReplacement, 0.3, seed).collect() - ) + val seed = 0L + val lowerb = 0.0 + val upperb = 0.3 + val maybeOut = if (withReplacement) "" else "out" + test(s"with$maybeOut replacement") { + val inputData = (1 to 1000).map { i => (i, i) }.toArray + val inputNode = new DummyNode(kvIntAttributes, inputData) + val sampleNode = new SampleNode(conf, lowerb, upperb, withReplacement, seed, inputNode) + val sampler = + if (withReplacement) { + new PoissonSampler[(Int, Int)](upperb - lowerb, useGapSamplingIfPossible = false) + } else { + new BernoulliCellSampler[(Int, Int)](lowerb, upperb) + } + sampler.setSeed(seed) + val expectedOutput = sampler.sample(inputData.iterator).toArray + val actualOutput = sampleNode.collect().map { case row => + (row.getInt(0), row.getInt(1)) + } + assert(actualOutput === expectedOutput) } } diff --git a/sql/core/src/test/scala/org/apache/spark/sql/execution/local/TakeOrderedAndProjectNodeSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/execution/local/TakeOrderedAndProjectNodeSuite.scala index ff28b24eeff14..42ebc7bfcaadc 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/execution/local/TakeOrderedAndProjectNodeSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/execution/local/TakeOrderedAndProjectNodeSuite.scala @@ -17,38 +17,34 @@ package org.apache.spark.sql.execution.local -import org.apache.spark.sql.Column -import org.apache.spark.sql.catalyst.expressions.{Ascending, Expression, SortOrder} +import scala.util.Random -class TakeOrderedAndProjectNodeSuite extends LocalNodeTest { +import org.apache.spark.sql.catalyst.expressions._ +import org.apache.spark.sql.catalyst.expressions.SortOrder - import testImplicits._ - private def columnToSortOrder(sortExprs: Column*): Seq[SortOrder] = { - val sortOrder: Seq[SortOrder] = sortExprs.map { col => - col.expr match { - case expr: SortOrder => - expr - case expr: Expression => - SortOrder(expr, Ascending) - } - } - sortOrder - } +class 
TakeOrderedAndProjectNodeSuite extends LocalNodeTest { - private def testTakeOrderedAndProjectNode(desc: Boolean): Unit = { - val testCaseName = if (desc) "desc" else "asc" - test(testCaseName) { - val input = (1 to 10).map(i => (i, i.toString)).toDF("key", "value") - val sortColumn = if (desc) input.col("key").desc else input.col("key") - checkAnswer( - input, - node => TakeOrderedAndProjectNode(conf, 5, columnToSortOrder(sortColumn), None, node), - input.sort(sortColumn).limit(5).collect() - ) + private def testTakeOrderedAndProject(desc: Boolean): Unit = { + val limit = 10 + val ascOrDesc = if (desc) "desc" else "asc" + test(ascOrDesc) { + val inputData = Random.shuffle((1 to 100).toList).map { i => (i, i) }.toArray + val inputNode = new DummyNode(kvIntAttributes, inputData) + val firstColumn = inputNode.output(0) + val sortDirection = if (desc) Descending else Ascending + val sortOrder = SortOrder(firstColumn, sortDirection) + val takeOrderAndProjectNode = new TakeOrderedAndProjectNode( + conf, limit, Seq(sortOrder), Some(Seq(firstColumn)), inputNode) + val expectedOutput = inputData + .map { case (k, _) => k } + .sortBy { k => k * (if (desc) -1 else 1) } + .take(limit) + val actualOutput = takeOrderAndProjectNode.collect().map { row => row.getInt(0) } + assert(actualOutput === expectedOutput) } } - testTakeOrderedAndProjectNode(desc = false) - testTakeOrderedAndProjectNode(desc = true) + testTakeOrderedAndProject(desc = false) + testTakeOrderedAndProject(desc = true) } diff --git a/sql/core/src/test/scala/org/apache/spark/sql/execution/local/UnionNodeSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/execution/local/UnionNodeSuite.scala index eedd7320900f9..666b0235c061d 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/execution/local/UnionNodeSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/execution/local/UnionNodeSuite.scala @@ -17,36 +17,39 @@ package org.apache.spark.sql.execution.local -import org.apache.spark.sql.test.SharedSQLContext -class UnionNodeSuite extends LocalNodeTest with SharedSQLContext { +class UnionNodeSuite extends LocalNodeTest { - test("basic") { - checkAnswer2( - testData, - testData, - (node1, node2) => UnionNode(conf, Seq(node1, node2)), - testData.unionAll(testData).collect() - ) + private def testUnion(inputData: Seq[Array[(Int, Int)]]): Unit = { + val inputNodes = inputData.map { data => + new DummyNode(kvIntAttributes, data) + } + val unionNode = new UnionNode(conf, inputNodes) + val expectedOutput = inputData.flatten + val actualOutput = unionNode.collect().map { case row => + (row.getInt(0), row.getInt(1)) + } + assert(actualOutput === expectedOutput) } test("empty") { - checkAnswer2( - emptyTestData, - emptyTestData, - (node1, node2) => UnionNode(conf, Seq(node1, node2)), - emptyTestData.unionAll(emptyTestData).collect() - ) + testUnion(Seq(Array.empty)) + testUnion(Seq(Array.empty, Array.empty)) + } + + test("self") { + val data = (1 to 100).map { i => (i, i) }.toArray + testUnion(Seq(data)) + testUnion(Seq(data, data)) + testUnion(Seq(data, data, data)) } - test("complicated union") { - val dfs = Seq(testData, emptyTestData, emptyTestData, testData, testData, emptyTestData, - emptyTestData, emptyTestData, testData, emptyTestData) - doCheckAnswer( - dfs, - nodes => UnionNode(conf, nodes), - dfs.reduce(_.unionAll(_)).collect() - ) + test("basic") { + val zero = Array.empty[(Int, Int)] + val one = (1 to 100).map { i => (i, i) }.toArray + val two = (50 to 150).map { i => (i, i) }.toArray + val three = (800 to 900).map { i 
=> (i, i) }.toArray + testUnion(Seq(zero, one, two, three)) } } From 64c29afcb787d9f176a197c25314295108ba0471 Mon Sep 17 00:00:00 2001 From: sureshthalamati Date: Tue, 15 Sep 2015 19:41:38 -0700 Subject: [PATCH 306/802] [SPARK-9078] [SQL] Allow jdbc dialects to override the query used to check the table. Current implementation uses query with a LIMIT clause to find if table already exists. This syntax works only in some database systems. This patch changes the default query to the one that is likely to work on most databases, and adds a new method to the JdbcDialect abstract class to allow dialects to override the default query. I looked at using the JDBC meta data calls, it turns out there is no common way to find the current schema, catalog..etc. There is a new method Connection.getSchema() , but that is available only starting jdk1.7 , and existing jdbc drivers may not have implemented it. Other option was to use jdbc escape syntax clause for LIMIT, not sure on how well this supported in all the databases also. After looking at all the jdbc metadata options my conclusion was most common way is to use the simple select query with 'where 1 =0' , and allow dialects to customize as needed Author: sureshthalamati Closes #8676 from sureshthalamati/table_exists_spark-9078. --- .../apache/spark/sql/DataFrameWriter.scala | 2 +- .../datasources/jdbc/JdbcUtils.scala | 9 ++++++--- .../apache/spark/sql/jdbc/JdbcDialects.scala | 20 +++++++++++++++++++ .../org/apache/spark/sql/jdbc/JDBCSuite.scala | 14 +++++++++++++ 4 files changed, 41 insertions(+), 4 deletions(-) diff --git a/sql/core/src/main/scala/org/apache/spark/sql/DataFrameWriter.scala b/sql/core/src/main/scala/org/apache/spark/sql/DataFrameWriter.scala index b2a66dd417b4c..745bb4ec9cf1c 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/DataFrameWriter.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/DataFrameWriter.scala @@ -255,7 +255,7 @@ final class DataFrameWriter private[sql](df: DataFrame) { val conn = JdbcUtils.createConnection(url, props) try { - var tableExists = JdbcUtils.tableExists(conn, table) + var tableExists = JdbcUtils.tableExists(conn, url, table) if (mode == SaveMode.Ignore && tableExists) { return diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/jdbc/JdbcUtils.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/jdbc/JdbcUtils.scala index 26788b2a4fd69..f89d55b20e212 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/jdbc/JdbcUtils.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/jdbc/JdbcUtils.scala @@ -42,10 +42,13 @@ object JdbcUtils extends Logging { /** * Returns true if the table already exists in the JDBC database. */ - def tableExists(conn: Connection, table: String): Boolean = { + def tableExists(conn: Connection, url: String, table: String): Boolean = { + val dialect = JdbcDialects.get(url) + // Somewhat hacky, but there isn't a good way to identify whether a table exists for all - // SQL database systems, considering "table" could also include the database name. - Try(conn.prepareStatement(s"SELECT 1 FROM $table LIMIT 1").executeQuery().next()).isSuccess + // SQL database systems using JDBC meta data calls, considering "table" could also include + // the database name. Query used to find table exists can be overriden by the dialects. 
+ Try(conn.prepareStatement(dialect.getTableExistsQuery(table)).executeQuery()).isSuccess } /** diff --git a/sql/core/src/main/scala/org/apache/spark/sql/jdbc/JdbcDialects.scala b/sql/core/src/main/scala/org/apache/spark/sql/jdbc/JdbcDialects.scala index c6d05c9b83b98..68ebaaca6c53d 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/jdbc/JdbcDialects.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/jdbc/JdbcDialects.scala @@ -88,6 +88,17 @@ abstract class JdbcDialect { def quoteIdentifier(colName: String): String = { s""""$colName"""" } + + /** + * Get the SQL query that should be used to find if the given table exists. Dialects can + * override this method to return a query that works best in a particular database. + * @param table The name of the table. + * @return The SQL query to use for checking the table. + */ + def getTableExistsQuery(table: String): String = { + s"SELECT * FROM $table WHERE 1=0" + } + } /** @@ -198,6 +209,11 @@ case object PostgresDialect extends JdbcDialect { case BooleanType => Some(JdbcType("BOOLEAN", java.sql.Types.BOOLEAN)) case _ => None } + + override def getTableExistsQuery(table: String): String = { + s"SELECT 1 FROM $table LIMIT 1" + } + } /** @@ -222,6 +238,10 @@ case object MySQLDialect extends JdbcDialect { override def quoteIdentifier(colName: String): String = { s"`$colName`" } + + override def getTableExistsQuery(table: String): String = { + s"SELECT 1 FROM $table LIMIT 1" + } } /** diff --git a/sql/core/src/test/scala/org/apache/spark/sql/jdbc/JDBCSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/jdbc/JDBCSuite.scala index ed710689cc670..5ab9381de4d66 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/jdbc/JDBCSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/jdbc/JDBCSuite.scala @@ -450,4 +450,18 @@ class JDBCSuite extends SparkFunSuite with BeforeAndAfter with SharedSQLContext assert(db2Dialect.getJDBCType(StringType).map(_.databaseTypeDefinition).get == "CLOB") assert(db2Dialect.getJDBCType(BooleanType).map(_.databaseTypeDefinition).get == "CHAR(1)") } + + test("table exists query by jdbc dialect") { + val MySQL = JdbcDialects.get("jdbc:mysql://127.0.0.1/db") + val Postgres = JdbcDialects.get("jdbc:postgresql://127.0.0.1/db") + val db2 = JdbcDialects.get("jdbc:db2://127.0.0.1/db") + val h2 = JdbcDialects.get(url) + val table = "weblogs" + val defaultQuery = s"SELECT * FROM $table WHERE 1=0" + val limitQuery = s"SELECT 1 FROM $table LIMIT 1" + assert(MySQL.getTableExistsQuery(table) == limitQuery) + assert(Postgres.getTableExistsQuery(table) == limitQuery) + assert(db2.getTableExistsQuery(table) == defaultQuery) + assert(h2.getTableExistsQuery(table) == defaultQuery) + } } From b921fe4dc0442aa133ab7d55fba24bc798d59aa2 Mon Sep 17 00:00:00 2001 From: "Joseph K. Bradley" Date: Tue, 15 Sep 2015 19:43:26 -0700 Subject: [PATCH 307/802] [SPARK-10595] [ML] [MLLIB] [DOCS] Various ML guide cleanups MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Various ML guide cleanups. * ml-guide.md: Make it easier to access the algorithm-specific guides. * LDA user guide: EM often begins with useless topics, but running longer generally improves them dramatically. E.g., 10 iterations on a Wikipedia dataset produces useless topics, but 50 iterations produces very meaningful topics. * mllib-feature-extraction.html#elementwiseproduct: “w” parameter should be “scalingVec” * Clean up Binarizer user guide a little. 
* Document in Pipeline that users should not put an instance into the Pipeline in more than 1 place. * spark.ml Word2Vec user guide: clean up grammar/writing * Chi Sq Feature Selector docs: Improve text in doc. CC: mengxr feynmanliang Author: Joseph K. Bradley Closes #8752 from jkbradley/mlguide-fixes-1.5. --- docs/ml-features.md | 34 +++++++++++++++++--- docs/ml-guide.md | 31 ++++++++++++------- docs/mllib-clustering.md | 4 +++ docs/mllib-feature-extraction.md | 53 +++++++++++++++++++++----------- docs/mllib-guide.md | 4 +-- 5 files changed, 91 insertions(+), 35 deletions(-) diff --git a/docs/ml-features.md b/docs/ml-features.md index a414c21b5c280..b70da4ac63845 100644 --- a/docs/ml-features.md +++ b/docs/ml-features.md @@ -123,12 +123,21 @@ for features_label in rescaledData.select("features", "label").take(3): ## Word2Vec -`Word2Vec` is an `Estimator` which takes sequences of words that represents documents and trains a `Word2VecModel`. The model is a `Map(String, Vector)` essentially, which maps each word to an unique fix-sized vector. The `Word2VecModel` transforms each documents into a vector using the average of all words in the document, which aims to other computations of documents such as similarity calculation consequencely. Please refer to the [MLlib user guide on Word2Vec](mllib-feature-extraction.html#Word2Vec) for more details on Word2Vec. +`Word2Vec` is an `Estimator` which takes sequences of words representing documents and trains a +`Word2VecModel`. The model maps each word to a unique fixed-size vector. The `Word2VecModel` +transforms each document into a vector using the average of all words in the document; this vector +can then be used for as features for prediction, document similarity calculations, etc. +Please refer to the [MLlib user guide on Word2Vec](mllib-feature-extraction.html#Word2Vec) for more +details. -Word2Vec is implemented in [Word2Vec](api/scala/index.html#org.apache.spark.ml.feature.Word2Vec). In the following code segment, we start with a set of documents, each of them is represented as a sequence of words. For each document, we transform it into a feature vector. This feature vector could then be passed to a learning algorithm. +In the following code segment, we start with a set of documents, each of which is represented as a sequence of words. For each document, we transform it into a feature vector. This feature vector could then be passed to a learning algorithm.
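As a complement to the API examples that follow, the paragraph above describes the document vector as the average of the document's word vectors. The short, framework-free Scala sketch below makes that averaging step concrete; the word vectors are invented values rather than output of a trained Word2VecModel, and the object name is hypothetical.

object DocumentVectorSketch {
  def main(args: Array[String]): Unit = {
    // Hypothetical word -> vector table standing in for a trained model's vectors.
    val wordVectors: Map[String, Array[Double]] = Map(
      "hi"    -> Array(0.1, 0.2, 0.3),
      "spark" -> Array(0.4, 0.0, 0.6),
      "mllib" -> Array(0.1, 0.4, 0.0))

    val document = Seq("hi", "spark", "mllib")

    // The document vector is the element-wise average of its word vectors,
    // which is the transformation described in the guide text above.
    val size = wordVectors(document.head).length
    val sums = document.map(wordVectors).foldLeft(Array.fill(size)(0.0)) { (acc, v) =>
      acc.zip(v).map { case (a, b) => a + b }
    }
    val docVector = sums.map(_ / document.size)

    // Approximately [0.2, 0.2, 0.3], up to floating-point rounding.
    println(docVector.mkString("[", ", ", "]"))
  }
}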
+ +Refer to the [Word2Vec Scala docs](api/scala/index.html#org.apache.spark.ml.feature.Word2Vec) +for more details on the API. + {% highlight scala %} import org.apache.spark.ml.feature.Word2Vec @@ -152,6 +161,10 @@ result.select("result").take(3).foreach(println)
+ +Refer to the [Word2Vec Java docs](api/java/org/apache/spark/ml/feature/Word2Vec.html) +for more details on the API. + {% highlight java %} import java.util.Arrays; @@ -192,6 +205,10 @@ for (Row r: result.select("result").take(3)) {
+ +Refer to the [Word2Vec Python docs](api/python/pyspark.ml.html#pyspark.ml.feature.Word2Vec) +for more details on the API. + {% highlight python %} from pyspark.ml.feature import Word2Vec @@ -621,12 +638,15 @@ for ngrams_label in ngramDataFrame.select("ngrams", "label").take(3): ## Binarizer -Binarization is the process of thresholding numerical features to binary features. As some probabilistic estimators make assumption that the input data is distributed according to [Bernoulli distribution](http://en.wikipedia.org/wiki/Bernoulli_distribution), a binarizer is useful for pre-processing the input data with continuous numerical features. +Binarization is the process of thresholding numerical features to binary (0/1) features. -A simple [Binarizer](api/scala/index.html#org.apache.spark.ml.feature.Binarizer) class provides this functionality. Besides the common parameters of `inputCol` and `outputCol`, `Binarizer` has the parameter `threshold` used for binarizing continuous numerical features. The features greater than the threshold, will be binarized to 1.0. The features equal to or less than the threshold, will be binarized to 0.0. The example below shows how to binarize numerical features. +`Binarizer` takes the common parameters `inputCol` and `outputCol`, as well as the `threshold` for binarization. Feature values greater than the threshold are binarized to 1.0; values equal to or less than the threshold are binarized to 0.0.
+ +Refer to the [Binarizer API doc](api/scala/index.html#org.apache.spark.ml.feature.Binarizer) for more details. + {% highlight scala %} import org.apache.spark.ml.feature.Binarizer import org.apache.spark.sql.DataFrame @@ -650,6 +670,9 @@ binarizedFeatures.collect().foreach(println)
+ +Refer to the [Binarizer API doc](api/java/org/apache/spark/ml/feature/Binarizer.html) for more details. + {% highlight java %} import java.util.Arrays; @@ -687,6 +710,9 @@ for (Row r : binarizedFeatures.collect()) {
+ +Refer to the [Binarizer API doc](api/python/pyspark.ml.html#pyspark.ml.feature.Binarizer) for more details. + {% highlight python %} from pyspark.ml.feature import Binarizer diff --git a/docs/ml-guide.md b/docs/ml-guide.md index 78c93a95c7807..c5d7f990021f1 100644 --- a/docs/ml-guide.md +++ b/docs/ml-guide.md @@ -32,7 +32,21 @@ See the [algorithm guides](#algorithm-guides) section below for guides on sub-pa * This will become a table of contents (this text will be scraped). {:toc} -# Main concepts +# Algorithm guides + +We provide several algorithm guides specific to the Pipelines API. +Several of these algorithms, such as certain feature transformers, are not in the `spark.mllib` API. +Also, some algorithms have additional capabilities in the `spark.ml` API; e.g., random forests +provide class probabilities, and linear models provide model summaries. + +* [Feature extraction, transformation, and selection](ml-features.html) +* [Decision Trees for classification and regression](ml-decision-tree.html) +* [Ensembles](ml-ensembles.html) +* [Linear methods with elastic net regularization](ml-linear-methods.html) +* [Multilayer perceptron classifier](ml-ann.html) + + +# Main concepts in Pipelines Spark ML standardizes APIs for machine learning algorithms to make it easier to combine multiple algorithms into a single pipeline, or workflow. @@ -166,6 +180,11 @@ compile-time type checking. `Pipeline`s and `PipelineModel`s instead do runtime checking before actually running the `Pipeline`. This type checking is done using the `DataFrame` *schema*, a description of the data types of columns in the `DataFrame`. +*Unique Pipeline stages*: A `Pipeline`'s stages should be unique instances. E.g., the same instance +`myHashingTF` should not be inserted into the `Pipeline` twice since `Pipeline` stages must have +unique IDs. However, different instances `myHashingTF1` and `myHashingTF2` (both of type `HashingTF`) +can be put into the same `Pipeline` since different instances will be created with different IDs. + ## Parameters Spark ML `Estimator`s and `Transformer`s use a uniform API for specifying parameters. @@ -184,16 +203,6 @@ Parameters belong to specific instances of `Estimator`s and `Transformer`s. For example, if we have two `LogisticRegression` instances `lr1` and `lr2`, then we can build a `ParamMap` with both `maxIter` parameters specified: `ParamMap(lr1.maxIter -> 10, lr2.maxIter -> 20)`. This is useful if there are two algorithms with the `maxIter` parameter in a `Pipeline`. -# Algorithm guides - -There are now several algorithms in the Pipelines API which are not in the `spark.mllib` API, so we link to documentation for them here. These algorithms are mostly feature transformers, which fit naturally into the `Transformer` abstraction in Pipelines, and ensembles, which fit naturally into the `Estimator` abstraction in the Pipelines. - -* [Feature extraction, transformation, and selection](ml-features.html) -* [Decision Trees for classification and regression](ml-decision-tree.html) -* [Ensembles](ml-ensembles.html) -* [Linear methods with elastic net regularization](ml-linear-methods.html) -* [Multilayer perceptron classifier](ml-ann.html) - # Code examples This section gives code examples illustrating the functionality discussed above. diff --git a/docs/mllib-clustering.md b/docs/mllib-clustering.md index 3fb35d3c50b06..c2711cf82deb4 100644 --- a/docs/mllib-clustering.md +++ b/docs/mllib-clustering.md @@ -507,6 +507,10 @@ must also be $> 1.0$. 
Providing `Vector(-1)` results in default behavior $> 1.0$. Providing `-1` results in defaulting to a value of $0.1 + 1$. * `maxIterations`: The maximum number of EM iterations. +*Note*: It is important to do enough iterations. In early iterations, EM often has useless topics, +but those topics improve dramatically after more iterations. Using at least 20 and possibly +50-100 iterations is often reasonable, depending on your dataset. + `EMLDAOptimizer` produces a `DistributedLDAModel`, which stores not only the inferred topics but also the full training corpus and topic distributions for each document in the training corpus. A diff --git a/docs/mllib-feature-extraction.md b/docs/mllib-feature-extraction.md index de86aba2ae627..7e417ed5f37a9 100644 --- a/docs/mllib-feature-extraction.md +++ b/docs/mllib-feature-extraction.md @@ -380,35 +380,43 @@ data2 = labels.zip(normalizer2.transform(features))
-## Feature selection -[Feature selection](http://en.wikipedia.org/wiki/Feature_selection) allows selecting the most relevant features for use in model construction. Feature selection reduces the size of the vector space and, in turn, the complexity of any subsequent operation with vectors. The number of features to select can be tuned using a held-out validation set. +## ChiSqSelector -### ChiSqSelector -[`ChiSqSelector`](api/scala/index.html#org.apache.spark.mllib.feature.ChiSqSelector) stands for Chi-Squared feature selection. It operates on labeled data with categorical features. `ChiSqSelector` orders features based on a Chi-Squared test of independence from the class, and then filters (selects) the top features which the class label depends on the most. This is akin to yielding the features with the most predictive power. +[Feature selection](http://en.wikipedia.org/wiki/Feature_selection) tries to identify relevant +features for use in model construction. It reduces the size of the feature space, which can improve +both speed and statistical learning behavior. -#### Model Fitting +[`ChiSqSelector`](api/scala/index.html#org.apache.spark.mllib.feature.ChiSqSelector) implements +Chi-Squared feature selection. It operates on labeled data with categorical features. +`ChiSqSelector` orders features based on a Chi-Squared test of independence from the class, +and then filters (selects) the top features which the class label depends on the most. +This is akin to yielding the features with the most predictive power. -[`ChiSqSelector`](api/scala/index.html#org.apache.spark.mllib.feature.ChiSqSelector) has the -following parameters in the constructor: +The number of features to select can be tuned using a held-out validation set. -* `numTopFeatures` number of top features that the selector will select (filter). +### Model Fitting -We provide a [`fit`](api/scala/index.html#org.apache.spark.mllib.feature.ChiSqSelector) method in -`ChiSqSelector` which can take an input of `RDD[LabeledPoint]` with categorical features, learn the summary statistics, and then -return a `ChiSqSelectorModel` which can transform an input dataset into the reduced feature space. +`ChiSqSelector` takes a `numTopFeatures` parameter specifying the number of top features that +the selector will select. -This model implements [`VectorTransformer`](api/scala/index.html#org.apache.spark.mllib.feature.VectorTransformer) -which can apply the Chi-Squared feature selection on a `Vector` to produce a reduced `Vector` or on +The [`fit`](api/scala/index.html#org.apache.spark.mllib.feature.ChiSqSelector) method takes +an input of `RDD[LabeledPoint]` with categorical features, learns the summary statistics, and then +returns a `ChiSqSelectorModel` which can transform an input dataset into the reduced feature space. +The `ChiSqSelectorModel` can be applied either to a `Vector` to produce a reduced `Vector`, or to an `RDD[Vector]` to produce a reduced `RDD[Vector]`. Note that the user can also construct a `ChiSqSelectorModel` by hand by providing an array of selected feature indices (which must be sorted in ascending order). -#### Example +### Example The following example shows the basic use of ChiSqSelector. The data set used has a feature matrix consisting of greyscale values that vary from 0 to 255 for each feature.
-
+
+ +Refer to the [`ChiSqSelector` Scala docs](api/scala/index.html#org.apache.spark.mllib.feature.ChiSqSelector) +for details on the API. + {% highlight scala %} import org.apache.spark.SparkContext._ import org.apache.spark.mllib.linalg.Vectors @@ -434,7 +442,11 @@ val filteredData = discretizedData.map { lp => {% endhighlight %}
-
+
+ +Refer to the [`ChiSqSelector` Java docs](api/java/org/apache/spark/mllib/feature/ChiSqSelector.html) +for details on the API. + {% highlight java %} import org.apache.spark.SparkConf; import org.apache.spark.api.java.JavaRDD; @@ -486,7 +498,12 @@ sc.stop(); ## ElementwiseProduct -ElementwiseProduct multiplies each input vector by a provided "weight" vector, using element-wise multiplication. In other words, it scales each column of the dataset by a scalar multiplier. This represents the [Hadamard product](https://en.wikipedia.org/wiki/Hadamard_product_%28matrices%29) between the input vector, `v` and transforming vector, `w`, to yield a result vector. +`ElementwiseProduct` multiplies each input vector by a provided "weight" vector, using element-wise +multiplication. In other words, it scales each column of the dataset by a scalar multiplier. This +represents the [Hadamard product](https://en.wikipedia.org/wiki/Hadamard_product_%28matrices%29) +between the input vector, `v` and transforming vector, `scalingVec`, to yield a result vector. + +Denoting the `scalingVec` as "`w`," this transformation may be written as: `\[ \begin{pmatrix} v_1 \\ @@ -506,7 +523,7 @@ v_N [`ElementwiseProduct`](api/scala/index.html#org.apache.spark.mllib.feature.ElementwiseProduct) has the following parameter in the constructor: -* `w`: the transforming vector. +* `scalingVec`: the transforming vector. `ElementwiseProduct` implements [`VectorTransformer`](api/scala/index.html#org.apache.spark.mllib.feature.VectorTransformer) which can apply the weighting on a `Vector` to produce a transformed `Vector` or on an `RDD[Vector]` to produce a transformed `RDD[Vector]`. diff --git a/docs/mllib-guide.md b/docs/mllib-guide.md index 257f7cc7603fa..91e50ccfecec4 100644 --- a/docs/mllib-guide.md +++ b/docs/mllib-guide.md @@ -13,9 +13,9 @@ primitives and higher-level pipeline APIs. It divides into two packages: -* [`spark.mllib`](mllib-guide.html#mllib-types-algorithms-and-utilities) contains the original API +* [`spark.mllib`](mllib-guide.html#data-types-algorithms-and-utilities) contains the original API built on top of [RDDs](programming-guide.html#resilient-distributed-datasets-rdds). -* [`spark.ml`](mllib-guide.html#sparkml-high-level-apis-for-ml-pipelines) provides higher-level API +* [`spark.ml`](ml-guide.html) provides higher-level API built on top of [DataFrames](sql-programming-guide.html#dataframes) for constructing ML pipelines. Using `spark.ml` is recommended because with DataFrames the API is more versatile and flexible. From 95b6a8103fb527f501ca26b1d6e3a5859970a1e2 Mon Sep 17 00:00:00 2001 From: Vinod K C Date: Tue, 15 Sep 2015 23:25:51 -0700 Subject: [PATCH 308/802] [SPARK-10516] [ MLLIB] Added values property in DenseVector Author: Vinod K C Closes #8682 from vinodkc/fix_SPARK-10516.
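
A quick illustration of the new property, as a PySpark sketch (this snippet is not part of the patch; it assumes a local PySpark installation with NumPy available):

{% highlight python %}
from pyspark.mllib.linalg import DenseVector

dv = DenseVector([1.0, 2.0, 3.0])

# The added `values` property returns the same underlying NumPy array as
# toArray(), giving DenseVector the same accessor SparseVector already has.
assert (dv.values == dv.toArray()).all()
print(dv.values)  # e.g. [ 1.  2.  3.]
{% endhighlight %}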
--- python/pyspark/mllib/linalg/__init__.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/python/pyspark/mllib/linalg/__init__.py b/python/pyspark/mllib/linalg/__init__.py index 380f86e9b44f8..4829acb16ed8a 100644 --- a/python/pyspark/mllib/linalg/__init__.py +++ b/python/pyspark/mllib/linalg/__init__.py @@ -399,6 +399,10 @@ def squared_distance(self, other): def toArray(self): return self.array + @property + def values(self): + return self.array + def __getitem__(self, item): return self.array[item] From 1894653edce718e874d1ddc9ba442bce43cbc082 Mon Sep 17 00:00:00 2001 From: Luciano Resende Date: Wed, 16 Sep 2015 10:47:30 +0100 Subject: [PATCH 309/802] [SPARK-10511] [BUILD] Reset git repository before packaging source distro The calculation of Spark version is downloading Scala and Zinc in the build directory which is inflating the size of the source distribution. Reseting the repo before packaging the source distribution fix this issue. Author: Luciano Resende Closes #8774 from lresende/spark-10511. --- dev/create-release/release-build.sh | 1 + 1 file changed, 1 insertion(+) diff --git a/dev/create-release/release-build.sh b/dev/create-release/release-build.sh index d0b3a54dde1dc..9dac43ce54425 100755 --- a/dev/create-release/release-build.sh +++ b/dev/create-release/release-build.sh @@ -99,6 +99,7 @@ fi DEST_DIR_NAME="spark-$SPARK_PACKAGE_VERSION" USER_HOST="$ASF_USERNAME@people.apache.org" +git clean -d -f -x rm .gitignore rm -rf .git cd .. From d9b7f3e4dbceb91ea4d1a1fed3ab847335f8588b Mon Sep 17 00:00:00 2001 From: Yu ISHIKAWA Date: Wed, 16 Sep 2015 04:34:14 -0700 Subject: [PATCH 310/802] [SPARK-10276] [MLLIB] [PYSPARK] Add @since annotation to pyspark.mllib.recommendation Author: Yu ISHIKAWA Closes #8677 from yu-iskw/SPARK-10276. --- python/pyspark/mllib/recommendation.py | 36 +++++++++++++++++++++++++- 1 file changed, 35 insertions(+), 1 deletion(-) diff --git a/python/pyspark/mllib/recommendation.py b/python/pyspark/mllib/recommendation.py index 506ca2151cce7..95047b5b7b4b7 100644 --- a/python/pyspark/mllib/recommendation.py +++ b/python/pyspark/mllib/recommendation.py @@ -18,7 +18,7 @@ import array from collections import namedtuple -from pyspark import SparkContext +from pyspark import SparkContext, since from pyspark.rdd import RDD from pyspark.mllib.common import JavaModelWrapper, callMLlibFunc, inherit_doc from pyspark.mllib.util import JavaLoader, JavaSaveable @@ -36,6 +36,8 @@ class Rating(namedtuple("Rating", ["user", "product", "rating"])): (1, 2, 5.0) >>> (r[0], r[1], r[2]) (1, 2, 5.0) + + .. versionadded:: 1.2.0 """ def __reduce__(self): @@ -111,13 +113,17 @@ class MatrixFactorizationModel(JavaModelWrapper, JavaSaveable, JavaLoader): ... rmtree(path) ... except OSError: ... pass + + .. versionadded:: 0.9.0 """ + @since("0.9.0") def predict(self, user, product): """ Predicts rating for the given user and product. """ return self._java_model.predict(int(user), int(product)) + @since("0.9.0") def predictAll(self, user_product): """ Returns a list of predicted ratings for input user and product pairs. 
@@ -128,6 +134,7 @@ def predictAll(self, user_product): user_product = user_product.map(lambda u_p: (int(u_p[0]), int(u_p[1]))) return self.call("predict", user_product) + @since("1.2.0") def userFeatures(self): """ Returns a paired RDD, where the first element is the user and the @@ -135,6 +142,7 @@ def userFeatures(self): """ return self.call("getUserFeatures").mapValues(lambda v: array.array('d', v)) + @since("1.2.0") def productFeatures(self): """ Returns a paired RDD, where the first element is the product and the @@ -142,6 +150,7 @@ def productFeatures(self): """ return self.call("getProductFeatures").mapValues(lambda v: array.array('d', v)) + @since("1.4.0") def recommendUsers(self, product, num): """ Recommends the top "num" number of users for a given product and returns a list @@ -149,6 +158,7 @@ def recommendUsers(self, product, num): """ return list(self.call("recommendUsers", product, num)) + @since("1.4.0") def recommendProducts(self, user, num): """ Recommends the top "num" number of products for a given user and returns a list @@ -157,17 +167,25 @@ def recommendProducts(self, user, num): return list(self.call("recommendProducts", user, num)) @property + @since("1.4.0") def rank(self): + """Rank for the features in this model""" return self.call("rank") @classmethod + @since("1.3.1") def load(cls, sc, path): + """Load a model from the given path""" model = cls._load_java(sc, path) wrapper = sc._jvm.MatrixFactorizationModelWrapper(model) return MatrixFactorizationModel(wrapper) class ALS(object): + """Alternating Least Squares matrix factorization + + .. versionadded:: 0.9.0 + """ @classmethod def _prepare(cls, ratings): @@ -188,15 +206,31 @@ def _prepare(cls, ratings): return ratings @classmethod + @since("0.9.0") def train(cls, ratings, rank, iterations=5, lambda_=0.01, blocks=-1, nonnegative=False, seed=None): + """ + Train a matrix factorization model given an RDD of ratings given by users to some products, + in the form of (userID, productID, rating) pairs. We approximate the ratings matrix as the + product of two lower-rank matrices of a given rank (number of features). To solve for these + features, we run a given number of iterations of ALS. This is done using a level of + parallelism given by `blocks`. + """ model = callMLlibFunc("trainALSModel", cls._prepare(ratings), rank, iterations, lambda_, blocks, nonnegative, seed) return MatrixFactorizationModel(model) @classmethod + @since("0.9.0") def trainImplicit(cls, ratings, rank, iterations=5, lambda_=0.01, blocks=-1, alpha=0.01, nonnegative=False, seed=None): + """ + Train a matrix factorization model given an RDD of 'implicit preferences' given by users + to some products, in the form of (userID, productID, preference) pairs. We approximate the + ratings matrix as the product of two lower-rank matrices of a given rank (number of + features). To solve for these features, we run a given number of iterations of ALS. + This is done using a level of parallelism given by `blocks`. + """ model = callMLlibFunc("trainImplicitALSModel", cls._prepare(ratings), rank, iterations, lambda_, blocks, alpha, nonnegative, seed) return MatrixFactorizationModel(model) From 5dbaf3d3911bbfa003bc75459aaad66b4f6e0c67 Mon Sep 17 00:00:00 2001 From: Sean Owen Date: Wed, 16 Sep 2015 19:19:23 +0100 Subject: [PATCH 311/802] [SPARK-10589] [WEBUI] Add defense against external site framing Set `X-Frame-Options: SAMEORIGIN` to protect against frame-related vulnerability Author: Sean Owen Closes #8745 from srowen/SPARK-10589. 
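
The change below exposes the header through a new `spark.ui.allowFramingFrom` configuration key. A rough usage sketch from PySpark (the origin URL is a placeholder, not something defined by the patch):

{% highlight python %}
from pyspark import SparkConf, SparkContext

# By default the UI servlets now answer with "X-Frame-Options: SAMEORIGIN".
# To permit framing from a single trusted origin, set the new key instead.
conf = (SparkConf()
        .setMaster("local[2]")
        .setAppName("ui-framing-demo")
        .set("spark.ui.allowFramingFrom", "https://example.com/"))
sc = SparkContext(conf=conf)
# UI responses should then carry: X-Frame-Options: ALLOW-FROM https://example.com/
sc.stop()
{% endhighlight %}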
--- .../spark/deploy/worker/ui/WorkerWebUI.scala | 7 ++++--- .../org/apache/spark/metrics/MetricsSystem.scala | 2 +- .../spark/metrics/sink/MetricsServlet.scala | 6 +++--- .../scala/org/apache/spark/ui/JettyUtils.scala | 16 ++++++++++++++-- .../main/scala/org/apache/spark/ui/WebUI.scala | 4 ++-- 5 files changed, 24 insertions(+), 11 deletions(-) diff --git a/core/src/main/scala/org/apache/spark/deploy/worker/ui/WorkerWebUI.scala b/core/src/main/scala/org/apache/spark/deploy/worker/ui/WorkerWebUI.scala index 709a27233598c..1a0598e50dcf1 100644 --- a/core/src/main/scala/org/apache/spark/deploy/worker/ui/WorkerWebUI.scala +++ b/core/src/main/scala/org/apache/spark/deploy/worker/ui/WorkerWebUI.scala @@ -20,9 +20,8 @@ package org.apache.spark.deploy.worker.ui import java.io.File import javax.servlet.http.HttpServletRequest -import org.apache.spark.{Logging, SparkConf} +import org.apache.spark.Logging import org.apache.spark.deploy.worker.Worker -import org.apache.spark.deploy.worker.ui.WorkerWebUI._ import org.apache.spark.ui.{SparkUI, WebUI} import org.apache.spark.ui.JettyUtils._ import org.apache.spark.util.RpcUtils @@ -49,7 +48,9 @@ class WorkerWebUI( attachPage(new WorkerPage(this)) attachHandler(createStaticHandler(WorkerWebUI.STATIC_RESOURCE_BASE, "/static")) attachHandler(createServletHandler("/log", - (request: HttpServletRequest) => logPage.renderLog(request), worker.securityMgr)) + (request: HttpServletRequest) => logPage.renderLog(request), + worker.securityMgr, + worker.conf)) } } diff --git a/core/src/main/scala/org/apache/spark/metrics/MetricsSystem.scala b/core/src/main/scala/org/apache/spark/metrics/MetricsSystem.scala index 4517f465ebd3b..48afe3ae3511f 100644 --- a/core/src/main/scala/org/apache/spark/metrics/MetricsSystem.scala +++ b/core/src/main/scala/org/apache/spark/metrics/MetricsSystem.scala @@ -88,7 +88,7 @@ private[spark] class MetricsSystem private ( */ def getServletHandlers: Array[ServletContextHandler] = { require(running, "Can only call getServletHandlers on a running MetricsSystem") - metricsServlet.map(_.getHandlers).getOrElse(Array()) + metricsServlet.map(_.getHandlers(conf)).getOrElse(Array()) } metricsConfig.initialize() diff --git a/core/src/main/scala/org/apache/spark/metrics/sink/MetricsServlet.scala b/core/src/main/scala/org/apache/spark/metrics/sink/MetricsServlet.scala index 0c2e212a33074..4193e1d21d3c1 100644 --- a/core/src/main/scala/org/apache/spark/metrics/sink/MetricsServlet.scala +++ b/core/src/main/scala/org/apache/spark/metrics/sink/MetricsServlet.scala @@ -27,7 +27,7 @@ import com.codahale.metrics.json.MetricsModule import com.fasterxml.jackson.databind.ObjectMapper import org.eclipse.jetty.servlet.ServletContextHandler -import org.apache.spark.SecurityManager +import org.apache.spark.{SparkConf, SecurityManager} import org.apache.spark.ui.JettyUtils._ private[spark] class MetricsServlet( @@ -49,10 +49,10 @@ private[spark] class MetricsServlet( val mapper = new ObjectMapper().registerModule( new MetricsModule(TimeUnit.SECONDS, TimeUnit.MILLISECONDS, servletShowSample)) - def getHandlers: Array[ServletContextHandler] = { + def getHandlers(conf: SparkConf): Array[ServletContextHandler] = { Array[ServletContextHandler]( createServletHandler(servletPath, - new ServletParams(request => getMetricsSnapshot(request), "text/json"), securityMgr) + new ServletParams(request => getMetricsSnapshot(request), "text/json"), securityMgr, conf) ) } diff --git a/core/src/main/scala/org/apache/spark/ui/JettyUtils.scala 
b/core/src/main/scala/org/apache/spark/ui/JettyUtils.scala index 779c0ba083596..b796a44fe01ac 100644 --- a/core/src/main/scala/org/apache/spark/ui/JettyUtils.scala +++ b/core/src/main/scala/org/apache/spark/ui/JettyUtils.scala @@ -59,7 +59,17 @@ private[spark] object JettyUtils extends Logging { def createServlet[T <% AnyRef]( servletParams: ServletParams[T], - securityMgr: SecurityManager): HttpServlet = { + securityMgr: SecurityManager, + conf: SparkConf): HttpServlet = { + + // SPARK-10589 avoid frame-related click-jacking vulnerability, using X-Frame-Options + // (see http://tools.ietf.org/html/rfc7034). By default allow framing only from the + // same origin, but allow framing for a specific named URI. + // Example: spark.ui.allowFramingFrom = https://example.com/ + val allowFramingFrom = conf.getOption("spark.ui.allowFramingFrom") + val xFrameOptionsValue = + allowFramingFrom.map(uri => s"ALLOW-FROM $uri").getOrElse("SAMEORIGIN") + new HttpServlet { override def doGet(request: HttpServletRequest, response: HttpServletResponse) { try { @@ -68,6 +78,7 @@ private[spark] object JettyUtils extends Logging { response.setStatus(HttpServletResponse.SC_OK) val result = servletParams.responder(request) response.setHeader("Cache-Control", "no-cache, no-store, must-revalidate") + response.setHeader("X-Frame-Options", xFrameOptionsValue) // scalastyle:off println response.getWriter.println(servletParams.extractFn(result)) // scalastyle:on println @@ -97,8 +108,9 @@ private[spark] object JettyUtils extends Logging { path: String, servletParams: ServletParams[T], securityMgr: SecurityManager, + conf: SparkConf, basePath: String = ""): ServletContextHandler = { - createServletHandler(path, createServlet(servletParams, securityMgr), basePath) + createServletHandler(path, createServlet(servletParams, securityMgr, conf), basePath) } /** Create a context handler that responds to a request with the given path prefix */ diff --git a/core/src/main/scala/org/apache/spark/ui/WebUI.scala b/core/src/main/scala/org/apache/spark/ui/WebUI.scala index 61449847add3d..81a121fd441bd 100644 --- a/core/src/main/scala/org/apache/spark/ui/WebUI.scala +++ b/core/src/main/scala/org/apache/spark/ui/WebUI.scala @@ -76,9 +76,9 @@ private[spark] abstract class WebUI( def attachPage(page: WebUIPage) { val pagePath = "/" + page.prefix val renderHandler = createServletHandler(pagePath, - (request: HttpServletRequest) => page.render(request), securityManager, basePath) + (request: HttpServletRequest) => page.render(request), securityManager, conf, basePath) val renderJsonHandler = createServletHandler(pagePath.stripSuffix("/") + "/json", - (request: HttpServletRequest) => page.renderJson(request), securityManager, basePath) + (request: HttpServletRequest) => page.renderJson(request), securityManager, conf, basePath) attachHandler(renderHandler) attachHandler(renderJsonHandler) pageToHandlers.getOrElseUpdate(page, ArrayBuffer[ServletContextHandler]()) From 896edb51ab7a88bbb31259e565311a9be6f2ca6d Mon Sep 17 00:00:00 2001 From: Sun Rui Date: Wed, 16 Sep 2015 13:20:39 -0700 Subject: [PATCH 312/802] [SPARK-10050] [SPARKR] Support collecting data of MapType in DataFrame. 1. Support collecting data of MapType from DataFrame. 2. Support data of MapType in createDataFrame. Author: Sun Rui Closes #8711 from sun-rui/SPARK-10050. 
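
The change itself is on the SparkR side; as a point of comparison, the analogous round trip in PySpark (where map columns already collect as Python dicts) looks roughly like this — a sketch that assumes an existing `SparkContext` named `sc`:

{% highlight python %}
from pyspark.sql import SQLContext, Row

sqlContext = SQLContext(sc)  # assumes an existing SparkContext `sc`

# A dict-valued column is inferred as MapType, and collect() hands it back
# as a plain Python dict; this patch gives SparkR the equivalent behavior,
# with maps surfacing as R environments.
df = sqlContext.createDataFrame(
    [Row(name="Bob", info={"age": 16.0, "height": 176.5})])
bob = df.collect()[0]
print(bob.info["height"])  # 176.5
{% endhighlight %}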
--- R/pkg/R/SQLContext.R | 5 +- R/pkg/R/deserialize.R | 14 +++++ R/pkg/R/schema.R | 34 ++++++++--- R/pkg/inst/tests/test_sparkSQL.R | 56 +++++++++++++++---- .../scala/org/apache/spark/api/r/SerDe.scala | 31 ++++++++++ .../org/apache/spark/sql/api/r/SQLUtils.scala | 6 ++ 6 files changed, 123 insertions(+), 23 deletions(-) diff --git a/R/pkg/R/SQLContext.R b/R/pkg/R/SQLContext.R index 4ac057d0f2d83..1c58fd96d750a 100644 --- a/R/pkg/R/SQLContext.R +++ b/R/pkg/R/SQLContext.R @@ -41,10 +41,7 @@ infer_type <- function(x) { if (type == "map") { stopifnot(length(x) > 0) key <- ls(x)[[1]] - list(type = "map", - keyType = "string", - valueType = infer_type(get(key, x)), - valueContainsNull = TRUE) + paste0("map") } else if (type == "array") { stopifnot(length(x) > 0) names <- names(x) diff --git a/R/pkg/R/deserialize.R b/R/pkg/R/deserialize.R index d1858ec227b56..ce88d0b071b72 100644 --- a/R/pkg/R/deserialize.R +++ b/R/pkg/R/deserialize.R @@ -50,6 +50,7 @@ readTypedObject <- function(con, type) { "t" = readTime(con), "a" = readArray(con), "l" = readList(con), + "e" = readEnv(con), "n" = NULL, "j" = getJobj(readString(con)), stop(paste("Unsupported type for deserialization", type))) @@ -121,6 +122,19 @@ readList <- function(con) { } } +readEnv <- function(con) { + env <- new.env() + len <- readInt(con) + if (len > 0) { + for (i in 1:len) { + key <- readString(con) + value <- readObject(con) + env[[key]] <- value + } + } + env +} + readRaw <- function(con) { dataLen <- readInt(con) readBin(con, raw(), as.integer(dataLen), endian = "big") diff --git a/R/pkg/R/schema.R b/R/pkg/R/schema.R index 62d4f73878d29..8df1563f8ebc0 100644 --- a/R/pkg/R/schema.R +++ b/R/pkg/R/schema.R @@ -131,13 +131,33 @@ checkType <- function(type) { if (type %in% primtiveTypes) { return() } else { - m <- regexec("^array<(.*)>$", type) - matchedStrings <- regmatches(type, m) - if (length(matchedStrings[[1]]) >= 2) { - elemType <- matchedStrings[[1]][2] - checkType(elemType) - return() - } + # Check complex types + firstChar <- substr(type, 1, 1) + switch (firstChar, + a = { + # Array type + m <- regexec("^array<(.*)>$", type) + matchedStrings <- regmatches(type, m) + if (length(matchedStrings[[1]]) >= 2) { + elemType <- matchedStrings[[1]][2] + checkType(elemType) + return() + } + }, + m = { + # Map type + m <- regexec("^map<(.*),(.*)>$", type) + matchedStrings <- regmatches(type, m) + if (length(matchedStrings[[1]]) >= 3) { + keyType <- matchedStrings[[1]][2] + if (keyType != "string" && keyType != "character") { + stop("Key type in a map must be string or character") + } + valueType <- matchedStrings[[1]][3] + checkType(valueType) + return() + } + }) } stop(paste("Unsupported type for Dataframe:", type)) diff --git a/R/pkg/inst/tests/test_sparkSQL.R b/R/pkg/inst/tests/test_sparkSQL.R index 98d4402d368e1..e159a69584274 100644 --- a/R/pkg/inst/tests/test_sparkSQL.R +++ b/R/pkg/inst/tests/test_sparkSQL.R @@ -57,7 +57,7 @@ mockLinesComplexType <- complexTypeJsonPath <- tempfile(pattern="sparkr-test", fileext=".tmp") writeLines(mockLinesComplexType, complexTypeJsonPath) -test_that("infer types", { +test_that("infer types and check types", { expect_equal(infer_type(1L), "integer") expect_equal(infer_type(1.0), "double") expect_equal(infer_type("abc"), "string") @@ -72,9 +72,9 @@ test_that("infer types", { checkStructField(testStruct$fields()[[2]], "b", "StringType", TRUE) e <- new.env() assign("a", 1L, envir = e) - expect_equal(infer_type(e), - list(type = "map", keyType = "string", valueType = "integer", - valueContainsNull = 
TRUE)) + expect_equal(infer_type(e), "map") + + expect_error(checkType("map"), "Key type in a map must be string or character") }) test_that("structType and structField", { @@ -242,7 +242,7 @@ test_that("create DataFrame with different data types", { expect_equal(collect(df), data.frame(l, stringsAsFactors = FALSE)) }) -test_that("create DataFrame with nested array and struct", { +test_that("create DataFrame with nested array and map", { # e <- new.env() # assign("n", 3L, envir = e) # l <- list(1:10, list("a", "b"), e, list(a="aa", b=3L)) @@ -253,21 +253,35 @@ test_that("create DataFrame with nested array and struct", { # ldf <- collect(df) # expect_equal(ldf[1,], l[[1]]) + # ArrayType and MapType + e <- new.env() + assign("n", 3L, envir = e) - # ArrayType only for now - l <- list(as.list(1:10), list("a", "b")) - df <- createDataFrame(sqlContext, list(l), c("a", "b")) - expect_equal(dtypes(df), list(c("a", "array"), c("b", "array"))) + l <- list(as.list(1:10), list("a", "b"), e) + df <- createDataFrame(sqlContext, list(l), c("a", "b", "c")) + expect_equal(dtypes(df), list(c("a", "array"), + c("b", "array"), + c("c", "map"))) expect_equal(count(df), 1) ldf <- collect(df) - expect_equal(names(ldf), c("a", "b")) + expect_equal(names(ldf), c("a", "b", "c")) expect_equal(ldf[1, 1][[1]], l[[1]]) expect_equal(ldf[1, 2][[1]], l[[2]]) + e <- ldf$c[[1]] + expect_equal(class(e), "environment") + expect_equal(ls(e), "n") + expect_equal(e$n, 3L) }) +# For test map type in DataFrame +mockLinesMapType <- c("{\"name\":\"Bob\",\"info\":{\"age\":16,\"height\":176.5}}", + "{\"name\":\"Alice\",\"info\":{\"age\":20,\"height\":164.3}}", + "{\"name\":\"David\",\"info\":{\"age\":60,\"height\":180}}") +mapTypeJsonPath <- tempfile(pattern="sparkr-test", fileext=".tmp") +writeLines(mockLinesMapType, mapTypeJsonPath) + test_that("Collect DataFrame with complex types", { - # only ArrayType now - # TODO: tests for StructType and MapType after they are supported + # ArrayType df <- jsonFile(sqlContext, complexTypeJsonPath) ldf <- collect(df) @@ -277,6 +291,24 @@ test_that("Collect DataFrame with complex types", { expect_equal(ldf$c1, list(list(1, 2, 3), list(4, 5, 6), list (7, 8, 9))) expect_equal(ldf$c2, list(list("a", "b", "c"), list("d", "e", "f"), list ("g", "h", "i"))) expect_equal(ldf$c3, list(list(1.0, 2.0, 3.0), list(4.0, 5.0, 6.0), list (7.0, 8.0, 9.0))) + + # MapType + schema <- structType(structField("name", "string"), + structField("info", "map")) + df <- read.df(sqlContext, mapTypeJsonPath, "json", schema) + expect_equal(dtypes(df), list(c("name", "string"), + c("info", "map"))) + ldf <- collect(df) + expect_equal(nrow(ldf), 3) + expect_equal(ncol(ldf), 2) + expect_equal(names(ldf), c("name", "info")) + expect_equal(ldf$name, c("Bob", "Alice", "David")) + bob <- ldf$info[[1]] + expect_equal(class(bob), "environment") + expect_equal(bob$age, 16) + expect_equal(bob$height, 176.5) + + # TODO: tests for StructType after it is supported }) test_that("jsonFile() on a local file returns a DataFrame", { diff --git a/core/src/main/scala/org/apache/spark/api/r/SerDe.scala b/core/src/main/scala/org/apache/spark/api/r/SerDe.scala index 3c92bb7a1c73c..0c78613e406e1 100644 --- a/core/src/main/scala/org/apache/spark/api/r/SerDe.scala +++ b/core/src/main/scala/org/apache/spark/api/r/SerDe.scala @@ -209,11 +209,23 @@ private[spark] object SerDe { case "array" => dos.writeByte('a') // Array of objects case "list" => dos.writeByte('l') + case "map" => dos.writeByte('e') case "jobj" => dos.writeByte('j') case _ => throw new 
IllegalArgumentException(s"Invalid type $typeStr") } } + private def writeKeyValue(dos: DataOutputStream, key: Object, value: Object): Unit = { + if (key == null) { + throw new IllegalArgumentException("Key in map can't be null.") + } else if (!key.isInstanceOf[String]) { + throw new IllegalArgumentException(s"Invalid map key type: ${key.getClass.getName}") + } + + writeString(dos, key.asInstanceOf[String]) + writeObject(dos, value) + } + def writeObject(dos: DataOutputStream, obj: Object): Unit = { if (obj == null) { writeType(dos, "void") @@ -306,6 +318,25 @@ private[spark] object SerDe { writeInt(dos, v.length) v.foreach(elem => writeObject(dos, elem)) + // Handle map + case v: java.util.Map[_, _] => + writeType(dos, "map") + writeInt(dos, v.size) + val iter = v.entrySet.iterator + while(iter.hasNext) { + val entry = iter.next + val key = entry.getKey + val value = entry.getValue + + writeKeyValue(dos, key.asInstanceOf[Object], value.asInstanceOf[Object]) + } + case v: scala.collection.Map[_, _] => + writeType(dos, "map") + writeInt(dos, v.size) + v.foreach { case (key, value) => + writeKeyValue(dos, key.asInstanceOf[Object], value.asInstanceOf[Object]) + } + case _ => writeType(dos, "jobj") writeJObj(dos, value) diff --git a/sql/core/src/main/scala/org/apache/spark/sql/api/r/SQLUtils.scala b/sql/core/src/main/scala/org/apache/spark/sql/api/r/SQLUtils.scala index d4b834adb6e39..f45d119c8cfdf 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/api/r/SQLUtils.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/api/r/SQLUtils.scala @@ -64,6 +64,12 @@ private[r] object SQLUtils { case r"\Aarray<(.*)${elemType}>\Z" => { org.apache.spark.sql.types.ArrayType(getSQLDataType(elemType)) } + case r"\Amap<(.*)${keyType},(.*)${valueType}>\Z" => { + if (keyType != "string" && keyType != "character") { + throw new IllegalArgumentException("Key type of a map must be string or character") + } + org.apache.spark.sql.types.MapType(getSQLDataType(keyType), getSQLDataType(valueType)) + } case _ => throw new IllegalArgumentException(s"Invaid type $dataType") } } From d39f15ea2b8bed5342d2f8e3c1936f915c470783 Mon Sep 17 00:00:00 2001 From: Kevin Cox Date: Wed, 16 Sep 2015 15:30:17 -0700 Subject: [PATCH 313/802] [SPARK-9794] [SQL] Fix datetime parsing in SparkSQL. This fixes https://issues.apache.org/jira/browse/SPARK-9794 by using a real ISO8601 parser. (courtesy of the xml component of the standard java library) cc: angelini Author: Kevin Cox Closes #8396 from kevincox/kevincox-sql-time-parsing. 
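
The core of the fix (visible in the diff below) is to hand the string to `javax.xml.bind.DatatypeConverter.parseDateTime`, after first rewriting the non-standard "GMT+01:00"-style suffix into a plain offset. A small Python sketch of just that rewrite step, to make the string manipulation explicit (illustration only, not the Spark code path):

{% highlight python %}
def normalize_iso8601(s):
    # "2000-01-01T00:00GMT+01:00" (ISO 8601 with a "GMT" prefix on the zone
    # offset) is rewritten to the standard "2000-01-01T00:00+01:00" before
    # being handed to a real ISO 8601 parser.
    i = s.find("GMT")
    return s[:i] + s[i + 3:] if i != -1 else s

assert normalize_iso8601("2000-01-01T00:00GMT+01:00") == "2000-01-01T00:00+01:00"
assert normalize_iso8601("2000-12-30T10:00:00Z") == "2000-12-30T10:00:00Z"
{% endhighlight %}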
--- .../sql/catalyst/util/DateTimeUtils.scala | 27 ++++++---------- .../catalyst/util/DateTimeUtilsSuite.scala | 32 +++++++++++++++++++ 2 files changed, 42 insertions(+), 17 deletions(-) diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/util/DateTimeUtils.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/util/DateTimeUtils.scala index 687ca000d12bb..400c4327be1c7 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/util/DateTimeUtils.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/util/DateTimeUtils.scala @@ -20,6 +20,7 @@ package org.apache.spark.sql.catalyst.util import java.sql.{Date, Timestamp} import java.text.{DateFormat, SimpleDateFormat} import java.util.{TimeZone, Calendar} +import javax.xml.bind.DatatypeConverter; import org.apache.spark.unsafe.types.UTF8String @@ -109,30 +110,22 @@ object DateTimeUtils { } def stringToTime(s: String): java.util.Date = { - if (!s.contains('T')) { + var indexOfGMT = s.indexOf("GMT"); + if (indexOfGMT != -1) { + // ISO8601 with a weird time zone specifier (2000-01-01T00:00GMT+01:00) + val s0 = s.substring(0, indexOfGMT) + val s1 = s.substring(indexOfGMT + 3) + // Mapped to 2000-01-01T00:00+01:00 + stringToTime(s0 + s1) + } else if (!s.contains('T')) { // JDBC escape string if (s.contains(' ')) { Timestamp.valueOf(s) } else { Date.valueOf(s) } - } else if (s.endsWith("Z")) { - // this is zero timezone of ISO8601 - stringToTime(s.substring(0, s.length - 1) + "GMT-00:00") - } else if (s.indexOf("GMT") == -1) { - // timezone with ISO8601 - val inset = "+00.00".length - val s0 = s.substring(0, s.length - inset) - val s1 = s.substring(s.length - inset, s.length) - if (s0.substring(s0.lastIndexOf(':')).contains('.')) { - stringToTime(s0 + "GMT" + s1) - } else { - stringToTime(s0 + ".0GMT" + s1) - } } else { - // ISO8601 with GMT insert - val ISO8601GMT: SimpleDateFormat = new SimpleDateFormat( "yyyy-MM-dd'T'HH:mm:ss.SSSz" ) - ISO8601GMT.parse(s) + DatatypeConverter.parseDateTime(s).getTime() } } diff --git a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/util/DateTimeUtilsSuite.scala b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/util/DateTimeUtilsSuite.scala index 6b9a11f0ff743..46335941b62d6 100644 --- a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/util/DateTimeUtilsSuite.scala +++ b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/util/DateTimeUtilsSuite.scala @@ -136,6 +136,38 @@ class DateTimeUtilsSuite extends SparkFunSuite { assert(stringToDate(UTF8String.fromString("2015-031-8")).isEmpty) } + test("string to time") { + // Tests with UTC. + var c = Calendar.getInstance(TimeZone.getTimeZone("UTC")) + c.set(Calendar.MILLISECOND, 0) + + c.set(1900, 0, 1, 0, 0, 0) + assert(stringToTime("1900-01-01T00:00:00GMT-00:00") === c.getTime()) + + c.set(2000, 11, 30, 10, 0, 0) + assert(stringToTime("2000-12-30T10:00:00Z") === c.getTime()) + + // Tests with set time zone. + c.setTimeZone(TimeZone.getTimeZone("GMT-04:00")) + c.set(Calendar.MILLISECOND, 0) + + c.set(1900, 0, 1, 0, 0, 0) + assert(stringToTime("1900-01-01T00:00:00-04:00") === c.getTime()) + + c.set(1900, 0, 1, 0, 0, 0) + assert(stringToTime("1900-01-01T00:00:00GMT-04:00") === c.getTime()) + + // Tests with local time zone. 
+ c.setTimeZone(TimeZone.getDefault()) + c.set(Calendar.MILLISECOND, 0) + + c.set(2000, 11, 30, 0, 0, 0) + assert(stringToTime("2000-12-30") === new Date(c.getTimeInMillis())) + + c.set(2000, 11, 30, 10, 0, 0) + assert(stringToTime("2000-12-30 10:00:00") === new Timestamp(c.getTimeInMillis())) + } + test("string to timestamp") { var c = Calendar.getInstance() c.set(1969, 11, 31, 16, 0, 0) From 49c649fa0b6affed108dbae85373b4b7247b338c Mon Sep 17 00:00:00 2001 From: Reynold Xin Date: Wed, 16 Sep 2015 15:32:01 -0700 Subject: [PATCH 314/802] Tiny style fix for d39f15ea2b8bed5342d2f8e3c1936f915c470783. --- .../org/apache/spark/sql/catalyst/util/DateTimeUtils.scala | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/util/DateTimeUtils.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/util/DateTimeUtils.scala index 400c4327be1c7..781ed1688a327 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/util/DateTimeUtils.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/util/DateTimeUtils.scala @@ -20,7 +20,7 @@ package org.apache.spark.sql.catalyst.util import java.sql.{Date, Timestamp} import java.text.{DateFormat, SimpleDateFormat} import java.util.{TimeZone, Calendar} -import javax.xml.bind.DatatypeConverter; +import javax.xml.bind.DatatypeConverter import org.apache.spark.unsafe.types.UTF8String From 69c9830d288d5b8d7f0abe7c8a65a4c966580a49 Mon Sep 17 00:00:00 2001 From: Cheng Lian Date: Thu, 17 Sep 2015 00:48:57 -0700 Subject: [PATCH 315/802] [MINOR] [CORE] Fixes minor variable name typo Author: Cheng Lian Closes #8784 from liancheng/typo-fix. --- .../apache/spark/serializer/GenericAvroSerializerSuite.scala | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/core/src/test/scala/org/apache/spark/serializer/GenericAvroSerializerSuite.scala b/core/src/test/scala/org/apache/spark/serializer/GenericAvroSerializerSuite.scala index bc9f3708ed69d..87f25e7245e1f 100644 --- a/core/src/test/scala/org/apache/spark/serializer/GenericAvroSerializerSuite.scala +++ b/core/src/test/scala/org/apache/spark/serializer/GenericAvroSerializerSuite.scala @@ -76,9 +76,9 @@ class GenericAvroSerializerSuite extends SparkFunSuite with SharedSparkContext { test("caches previously seen schemas") { val genericSer = new GenericAvroSerializer(conf.getAvroSchema) val compressedSchema = genericSer.compress(schema) - val decompressedScheam = genericSer.decompress(ByteBuffer.wrap(compressedSchema)) + val decompressedSchema = genericSer.decompress(ByteBuffer.wrap(compressedSchema)) assert(compressedSchema.eq(genericSer.compress(schema))) - assert(decompressedScheam.eq(genericSer.decompress(ByteBuffer.wrap(compressedSchema)))) + assert(decompressedSchema.eq(genericSer.decompress(ByteBuffer.wrap(compressedSchema)))) } } From c633ed3260140f1288f326acc4d7a10dcd2e27d5 Mon Sep 17 00:00:00 2001 From: Yu ISHIKAWA Date: Thu, 17 Sep 2015 08:43:59 -0700 Subject: [PATCH 316/802] [SPARK-10284] [ML] [PYSPARK] [DOCS] Add @since annotation to pyspark.ml.tuning Author: Yu ISHIKAWA Closes #8694 from yu-iskw/SPARK-10284. 
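
These `@since` annotations rely on the `since` decorator imported from the top-level `pyspark` package. For background, a simplified sketch of what such a decorator does (an approximation, not the exact Spark implementation): it appends a Sphinx `.. versionadded::` directive to the wrapped function's docstring.

{% highlight python %}
import re


def since(version):
    # Approximate sketch: append ".. versionadded:: <version>" to the
    # docstring, matching the indentation already used in that docstring.
    indent_pattern = re.compile(r'\n( +)')

    def deco(f):
        doc = f.__doc__ or ""
        indents = indent_pattern.findall(doc)
        indent = ' ' * (min(len(i) for i in indents) if indents else 0)
        f.__doc__ = doc.rstrip() + "\n\n%s.. versionadded:: %s" % (indent, version)
        return f
    return deco


@since("1.4.0")
def getNumFolds():
    """Gets the value of numFolds or its default value."""
    return 3


print(getNumFolds.__doc__)
# Gets the value of numFolds or its default value.
#
# .. versionadded:: 1.4.0
{% endhighlight %}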
--- python/pyspark/ml/tuning.py | 28 ++++++++++++++++++++++++++++ 1 file changed, 28 insertions(+) diff --git a/python/pyspark/ml/tuning.py b/python/pyspark/ml/tuning.py index cae778869e9c5..ab5621f45c72c 100644 --- a/python/pyspark/ml/tuning.py +++ b/python/pyspark/ml/tuning.py @@ -18,6 +18,7 @@ import itertools import numpy as np +from pyspark import since from pyspark.ml.param import Params, Param from pyspark.ml import Estimator, Model from pyspark.ml.util import keyword_only @@ -47,11 +48,14 @@ class ParamGridBuilder(object): True >>> all([m in expected for m in output]) True + + .. versionadded:: 1.4.0 """ def __init__(self): self._param_grid = {} + @since("1.4.0") def addGrid(self, param, values): """ Sets the given parameters in this grid to fixed values. @@ -60,6 +64,7 @@ def addGrid(self, param, values): return self + @since("1.4.0") def baseOn(self, *args): """ Sets the given parameters in this grid to fixed values. @@ -73,6 +78,7 @@ def baseOn(self, *args): return self + @since("1.4.0") def build(self): """ Builds and returns all combinations of parameters specified @@ -104,6 +110,8 @@ class CrossValidator(Estimator): >>> cvModel = cv.fit(dataset) >>> evaluator.evaluate(cvModel.transform(dataset)) 0.8333... + + .. versionadded:: 1.4.0 """ # a placeholder to make it appear in the generated doc @@ -142,6 +150,7 @@ def __init__(self, estimator=None, estimatorParamMaps=None, evaluator=None, numF self._set(**kwargs) @keyword_only + @since("1.4.0") def setParams(self, estimator=None, estimatorParamMaps=None, evaluator=None, numFolds=3): """ setParams(self, estimator=None, estimatorParamMaps=None, evaluator=None, numFolds=3): @@ -150,6 +159,7 @@ def setParams(self, estimator=None, estimatorParamMaps=None, evaluator=None, num kwargs = self.setParams._input_kwargs return self._set(**kwargs) + @since("1.4.0") def setEstimator(self, value): """ Sets the value of :py:attr:`estimator`. @@ -157,12 +167,14 @@ def setEstimator(self, value): self._paramMap[self.estimator] = value return self + @since("1.4.0") def getEstimator(self): """ Gets the value of estimator or its default value. """ return self.getOrDefault(self.estimator) + @since("1.4.0") def setEstimatorParamMaps(self, value): """ Sets the value of :py:attr:`estimatorParamMaps`. @@ -170,12 +182,14 @@ def setEstimatorParamMaps(self, value): self._paramMap[self.estimatorParamMaps] = value return self + @since("1.4.0") def getEstimatorParamMaps(self): """ Gets the value of estimatorParamMaps or its default value. """ return self.getOrDefault(self.estimatorParamMaps) + @since("1.4.0") def setEvaluator(self, value): """ Sets the value of :py:attr:`evaluator`. @@ -183,12 +197,14 @@ def setEvaluator(self, value): self._paramMap[self.evaluator] = value return self + @since("1.4.0") def getEvaluator(self): """ Gets the value of evaluator or its default value. """ return self.getOrDefault(self.evaluator) + @since("1.4.0") def setNumFolds(self, value): """ Sets the value of :py:attr:`numFolds`. @@ -196,6 +212,7 @@ def setNumFolds(self, value): self._paramMap[self.numFolds] = value return self + @since("1.4.0") def getNumFolds(self): """ Gets the value of numFolds or its default value. @@ -231,7 +248,15 @@ def _fit(self, dataset): bestModel = est.fit(dataset, epm[bestIndex]) return CrossValidatorModel(bestModel) + @since("1.4.0") def copy(self, extra=None): + """ + Creates a copy of this instance with a randomly generated uid + and some extra params. 
This copies creates a deep copy of + the embedded paramMap, and copies the embedded and extra parameters over. + :param extra: Extra parameters to copy to the new instance + :return: Copy of this instance + """ if extra is None: extra = dict() newCV = Params.copy(self, extra) @@ -246,6 +271,8 @@ def copy(self, extra=None): class CrossValidatorModel(Model): """ Model from k-fold cross validation. + + .. versionadded:: 1.4.0 """ def __init__(self, bestModel): @@ -256,6 +283,7 @@ def __init__(self, bestModel): def _transform(self, dataset): return self.bestModel.transform(dataset) + @since("1.4.0") def copy(self, extra=None): """ Creates a copy of this instance with a randomly generated uid From 29bf8aa5a51fdd8c2600533297f991e14fa27c03 Mon Sep 17 00:00:00 2001 From: Yu ISHIKAWA Date: Thu, 17 Sep 2015 08:45:20 -0700 Subject: [PATCH 317/802] [SPARK-10283] [ML] [PYSPARK] [DOCS] Add @since annotation to pyspark.ml.regression Author: Yu ISHIKAWA Closes #8693 from yu-iskw/SPARK-10283. --- python/pyspark/ml/regression.py | 65 +++++++++++++++++++++++++++++++++ 1 file changed, 65 insertions(+) diff --git a/python/pyspark/ml/regression.py b/python/pyspark/ml/regression.py index a9503608b7f25..21d454f9003bb 100644 --- a/python/pyspark/ml/regression.py +++ b/python/pyspark/ml/regression.py @@ -15,6 +15,7 @@ # limitations under the License. # +from pyspark import since from pyspark.ml.util import keyword_only from pyspark.ml.wrapper import JavaEstimator, JavaModel from pyspark.ml.param.shared import * @@ -62,6 +63,8 @@ class LinearRegression(JavaEstimator, HasFeaturesCol, HasLabelCol, HasPrediction Traceback (most recent call last): ... TypeError: Method setParams forces keyword arguments. + + .. versionadded:: 1.4.0 """ @keyword_only @@ -81,6 +84,7 @@ def __init__(self, featuresCol="features", labelCol="label", predictionCol="pred self.setParams(**kwargs) @keyword_only + @since("1.4.0") def setParams(self, featuresCol="features", labelCol="label", predictionCol="prediction", maxIter=100, regParam=0.0, elasticNetParam=0.0, tol=1e-6, fitIntercept=True, standardization=True): @@ -96,13 +100,31 @@ def setParams(self, featuresCol="features", labelCol="label", predictionCol="pre def _create_model(self, java_model): return LinearRegressionModel(java_model) + @since("1.4.0") + def setElasticNetParam(self, value): + """ + Sets the value of :py:attr:`elasticNetParam`. + """ + self._paramMap[self.elasticNetParam] = value + return self + + @since("1.4.0") + def getElasticNetParam(self): + """ + Gets the value of elasticNetParam or its default value. + """ + return self.getOrDefault(self.elasticNetParam) + class LinearRegressionModel(JavaModel): """ Model fitted by LinearRegression. + + .. versionadded:: 1.4.0 """ @property + @since("1.4.0") def weights(self): """ Model weights. @@ -110,6 +132,7 @@ def weights(self): return self._call_java("weights") @property + @since("1.4.0") def intercept(self): """ Model intercept. @@ -162,6 +185,8 @@ class DecisionTreeRegressor(JavaEstimator, HasFeaturesCol, HasLabelCol, HasPredi >>> test1 = sqlContext.createDataFrame([(Vectors.sparse(1, [0], [1.0]),)], ["features"]) >>> model.transform(test1).head().prediction 1.0 + + .. 
versionadded:: 1.4.0 """ # a placeholder to make it appear in the generated doc @@ -193,6 +218,7 @@ def __init__(self, featuresCol="features", labelCol="label", predictionCol="pred self.setParams(**kwargs) @keyword_only + @since("1.4.0") def setParams(self, featuresCol="features", labelCol="label", predictionCol="prediction", maxDepth=5, maxBins=32, minInstancesPerNode=1, minInfoGain=0.0, maxMemoryInMB=256, cacheNodeIds=False, checkpointInterval=10, @@ -209,6 +235,7 @@ def setParams(self, featuresCol="features", labelCol="label", predictionCol="pre def _create_model(self, java_model): return DecisionTreeRegressionModel(java_model) + @since("1.4.0") def setImpurity(self, value): """ Sets the value of :py:attr:`impurity`. @@ -216,6 +243,7 @@ def setImpurity(self, value): self._paramMap[self.impurity] = value return self + @since("1.4.0") def getImpurity(self): """ Gets the value of impurity or its default value. @@ -225,13 +253,19 @@ def getImpurity(self): @inherit_doc class DecisionTreeModel(JavaModel): + """Abstraction for Decision Tree models. + + .. versionadded:: 1.5.0 + """ @property + @since("1.5.0") def numNodes(self): """Return number of nodes of the decision tree.""" return self._call_java("numNodes") @property + @since("1.5.0") def depth(self): """Return depth of the decision tree.""" return self._call_java("depth") @@ -242,8 +276,13 @@ def __repr__(self): @inherit_doc class TreeEnsembleModels(JavaModel): + """Represents a tree ensemble model. + + .. versionadded:: 1.5.0 + """ @property + @since("1.5.0") def treeWeights(self): """Return the weights for each tree""" return list(self._call_java("javaTreeWeights")) @@ -256,6 +295,8 @@ def __repr__(self): class DecisionTreeRegressionModel(DecisionTreeModel): """ Model fitted by DecisionTreeRegressor. + + .. versionadded:: 1.4.0 """ @@ -282,6 +323,8 @@ class RandomForestRegressor(JavaEstimator, HasFeaturesCol, HasLabelCol, HasPredi >>> test1 = sqlContext.createDataFrame([(Vectors.sparse(1, [0], [1.0]),)], ["features"]) >>> model.transform(test1).head().prediction 0.5 + + .. versionadded:: 1.4.0 """ # a placeholder to make it appear in the generated doc @@ -336,6 +379,7 @@ def __init__(self, featuresCol="features", labelCol="label", predictionCol="pred self.setParams(**kwargs) @keyword_only + @since("1.4.0") def setParams(self, featuresCol="features", labelCol="label", predictionCol="prediction", maxDepth=5, maxBins=32, minInstancesPerNode=1, minInfoGain=0.0, maxMemoryInMB=256, cacheNodeIds=False, checkpointInterval=10, seed=None, @@ -353,6 +397,7 @@ def setParams(self, featuresCol="features", labelCol="label", predictionCol="pre def _create_model(self, java_model): return RandomForestRegressionModel(java_model) + @since("1.4.0") def setImpurity(self, value): """ Sets the value of :py:attr:`impurity`. @@ -360,12 +405,14 @@ def setImpurity(self, value): self._paramMap[self.impurity] = value return self + @since("1.4.0") def getImpurity(self): """ Gets the value of impurity or its default value. """ return self.getOrDefault(self.impurity) + @since("1.4.0") def setSubsamplingRate(self, value): """ Sets the value of :py:attr:`subsamplingRate`. @@ -373,12 +420,14 @@ def setSubsamplingRate(self, value): self._paramMap[self.subsamplingRate] = value return self + @since("1.4.0") def getSubsamplingRate(self): """ Gets the value of subsamplingRate or its default value. """ return self.getOrDefault(self.subsamplingRate) + @since("1.4.0") def setNumTrees(self, value): """ Sets the value of :py:attr:`numTrees`. 
@@ -386,12 +435,14 @@ def setNumTrees(self, value): self._paramMap[self.numTrees] = value return self + @since("1.4.0") def getNumTrees(self): """ Gets the value of numTrees or its default value. """ return self.getOrDefault(self.numTrees) + @since("1.4.0") def setFeatureSubsetStrategy(self, value): """ Sets the value of :py:attr:`featureSubsetStrategy`. @@ -399,6 +450,7 @@ def setFeatureSubsetStrategy(self, value): self._paramMap[self.featureSubsetStrategy] = value return self + @since("1.4.0") def getFeatureSubsetStrategy(self): """ Gets the value of featureSubsetStrategy or its default value. @@ -409,6 +461,8 @@ def getFeatureSubsetStrategy(self): class RandomForestRegressionModel(TreeEnsembleModels): """ Model fitted by RandomForestRegressor. + + .. versionadded:: 1.4.0 """ @@ -435,6 +489,8 @@ class GBTRegressor(JavaEstimator, HasFeaturesCol, HasLabelCol, HasPredictionCol, >>> test1 = sqlContext.createDataFrame([(Vectors.sparse(1, [0], [1.0]),)], ["features"]) >>> model.transform(test1).head().prediction 1.0 + + .. versionadded:: 1.4.0 """ # a placeholder to make it appear in the generated doc @@ -481,6 +537,7 @@ def __init__(self, featuresCol="features", labelCol="label", predictionCol="pred self.setParams(**kwargs) @keyword_only + @since("1.4.0") def setParams(self, featuresCol="features", labelCol="label", predictionCol="prediction", maxDepth=5, maxBins=32, minInstancesPerNode=1, minInfoGain=0.0, maxMemoryInMB=256, cacheNodeIds=False, checkpointInterval=10, @@ -498,6 +555,7 @@ def setParams(self, featuresCol="features", labelCol="label", predictionCol="pre def _create_model(self, java_model): return GBTRegressionModel(java_model) + @since("1.4.0") def setLossType(self, value): """ Sets the value of :py:attr:`lossType`. @@ -505,12 +563,14 @@ def setLossType(self, value): self._paramMap[self.lossType] = value return self + @since("1.4.0") def getLossType(self): """ Gets the value of lossType or its default value. """ return self.getOrDefault(self.lossType) + @since("1.4.0") def setSubsamplingRate(self, value): """ Sets the value of :py:attr:`subsamplingRate`. @@ -518,12 +578,14 @@ def setSubsamplingRate(self, value): self._paramMap[self.subsamplingRate] = value return self + @since("1.4.0") def getSubsamplingRate(self): """ Gets the value of subsamplingRate or its default value. """ return self.getOrDefault(self.subsamplingRate) + @since("1.4.0") def setStepSize(self, value): """ Sets the value of :py:attr:`stepSize`. @@ -531,6 +593,7 @@ def setStepSize(self, value): self._paramMap[self.stepSize] = value return self + @since("1.4.0") def getStepSize(self): """ Gets the value of stepSize or its default value. @@ -541,6 +604,8 @@ def getStepSize(self): class GBTRegressionModel(TreeEnsembleModels): """ Model fitted by GBTRegressor. + + .. versionadded:: 1.4.0 """ From 0ded87a4d49d4484e202bd2ec781821b57b5882c Mon Sep 17 00:00:00 2001 From: Yu ISHIKAWA Date: Thu, 17 Sep 2015 08:47:21 -0700 Subject: [PATCH 318/802] [SPARK-10281] [ML] [PYSPARK] [DOCS] Add @since annotation to pyspark.ml.clustering Author: Yu ISHIKAWA Closes #8691 from yu-iskw/SPARK-10281. --- python/pyspark/ml/clustering.py | 13 +++++++++++++ 1 file changed, 13 insertions(+) diff --git a/python/pyspark/ml/clustering.py b/python/pyspark/ml/clustering.py index cb4c16e25a7a3..7bb8ab94e17df 100644 --- a/python/pyspark/ml/clustering.py +++ b/python/pyspark/ml/clustering.py @@ -15,6 +15,7 @@ # limitations under the License. 
# +from pyspark import since from pyspark.ml.util import keyword_only from pyspark.ml.wrapper import JavaEstimator, JavaModel from pyspark.ml.param.shared import * @@ -26,8 +27,11 @@ class KMeansModel(JavaModel): """ Model fitted by KMeans. + + .. versionadded:: 1.5.0 """ + @since("1.5.0") def clusterCenters(self): """Get the cluster centers, represented as a list of NumPy arrays.""" return [c.toArray() for c in self._call_java("clusterCenters")] @@ -55,6 +59,8 @@ class KMeans(JavaEstimator, HasFeaturesCol, HasPredictionCol, HasMaxIter, HasTol True >>> rows[2].prediction == rows[3].prediction True + + .. versionadded:: 1.5.0 """ # a placeholder to make it appear in the generated doc @@ -88,6 +94,7 @@ def _create_model(self, java_model): return KMeansModel(java_model) @keyword_only + @since("1.5.0") def setParams(self, featuresCol="features", predictionCol="prediction", k=2, initMode="k-means||", initSteps=5, tol=1e-4, maxIter=20, seed=None): """ @@ -99,6 +106,7 @@ def setParams(self, featuresCol="features", predictionCol="prediction", k=2, kwargs = self.setParams._input_kwargs return self._set(**kwargs) + @since("1.5.0") def setK(self, value): """ Sets the value of :py:attr:`k`. @@ -110,12 +118,14 @@ def setK(self, value): self._paramMap[self.k] = value return self + @since("1.5.0") def getK(self): """ Gets the value of `k` """ return self.getOrDefault(self.k) + @since("1.5.0") def setInitMode(self, value): """ Sets the value of :py:attr:`initMode`. @@ -130,12 +140,14 @@ def setInitMode(self, value): self._paramMap[self.initMode] = value return self + @since("1.5.0") def getInitMode(self): """ Gets the value of `initMode` """ return self.getOrDefault(self.initMode) + @since("1.5.0") def setInitSteps(self, value): """ Sets the value of :py:attr:`initSteps`. @@ -147,6 +159,7 @@ def setInitSteps(self, value): self._paramMap[self.initSteps] = value return self + @since("1.5.0") def getInitSteps(self): """ Gets the value of `initSteps` From 39b44cb52eb225469eb4ccdf696f0bc6405b9184 Mon Sep 17 00:00:00 2001 From: Yu ISHIKAWA Date: Thu, 17 Sep 2015 08:48:45 -0700 Subject: [PATCH 319/802] [SPARK-10278] [MLLIB] [PYSPARK] Add @since annotation to pyspark.mllib.tree Author: Yu ISHIKAWA Closes #8685 from yu-iskw/SPARK-10278. --- python/pyspark/mllib/tree.py | 36 +++++++++++++++++++++++++++++++++++- 1 file changed, 35 insertions(+), 1 deletion(-) diff --git a/python/pyspark/mllib/tree.py b/python/pyspark/mllib/tree.py index 372b86a7c95d9..0001b60093a69 100644 --- a/python/pyspark/mllib/tree.py +++ b/python/pyspark/mllib/tree.py @@ -19,7 +19,7 @@ import random -from pyspark import SparkContext, RDD +from pyspark import SparkContext, RDD, since from pyspark.mllib.common import callMLlibFunc, inherit_doc, JavaModelWrapper from pyspark.mllib.linalg import _convert_to_vector from pyspark.mllib.regression import LabeledPoint @@ -30,6 +30,11 @@ class TreeEnsembleModel(JavaModelWrapper, JavaSaveable): + """TreeEnsembleModel + + .. versionadded:: 1.3.0 + """ + @since("1.3.0") def predict(self, x): """ Predict values for a single data point or an RDD of points using @@ -45,12 +50,14 @@ def predict(self, x): else: return self.call("predict", _convert_to_vector(x)) + @since("1.3.0") def numTrees(self): """ Get number of trees in ensemble. 
""" return self.call("numTrees") + @since("1.3.0") def totalNumNodes(self): """ Get total number of nodes, summed over all trees in the @@ -62,6 +69,7 @@ def __repr__(self): """ Summary of model """ return self._java_model.toString() + @since("1.3.0") def toDebugString(self): """ Full model """ return self._java_model.toDebugString() @@ -72,7 +80,10 @@ class DecisionTreeModel(JavaModelWrapper, JavaSaveable, JavaLoader): .. note:: Experimental A decision tree model for classification or regression. + + .. versionadded:: 1.1.0 """ + @since("1.1.0") def predict(self, x): """ Predict the label of one or more examples. @@ -90,16 +101,23 @@ def predict(self, x): else: return self.call("predict", _convert_to_vector(x)) + @since("1.1.0") def numNodes(self): + """Get number of nodes in tree, including leaf nodes.""" return self._java_model.numNodes() + @since("1.1.0") def depth(self): + """Get depth of tree. + E.g.: Depth 0 means 1 leaf node. Depth 1 means 1 internal node and 2 leaf nodes. + """ return self._java_model.depth() def __repr__(self): """ summary of model. """ return self._java_model.toString() + @since("1.2.0") def toDebugString(self): """ full model. """ return self._java_model.toDebugString() @@ -115,6 +133,8 @@ class DecisionTree(object): Learning algorithm for a decision tree model for classification or regression. + + .. versionadded:: 1.1.0 """ @classmethod @@ -127,6 +147,7 @@ def _train(cls, data, type, numClasses, features, impurity="gini", maxDepth=5, m return DecisionTreeModel(model) @classmethod + @since("1.1.0") def trainClassifier(cls, data, numClasses, categoricalFeaturesInfo, impurity="gini", maxDepth=5, maxBins=32, minInstancesPerNode=1, minInfoGain=0.0): @@ -185,6 +206,7 @@ def trainClassifier(cls, data, numClasses, categoricalFeaturesInfo, impurity, maxDepth, maxBins, minInstancesPerNode, minInfoGain) @classmethod + @since("1.1.0") def trainRegressor(cls, data, categoricalFeaturesInfo, impurity="variance", maxDepth=5, maxBins=32, minInstancesPerNode=1, minInfoGain=0.0): @@ -239,6 +261,8 @@ class RandomForestModel(TreeEnsembleModel, JavaLoader): .. note:: Experimental Represents a random forest model. + + .. versionadded:: 1.2.0 """ @classmethod @@ -252,6 +276,8 @@ class RandomForest(object): Learning algorithm for a random forest model for classification or regression. + + .. versionadded:: 1.2.0 """ supportedFeatureSubsetStrategies = ("auto", "all", "sqrt", "log2", "onethird") @@ -271,6 +297,7 @@ def _train(cls, data, algo, numClasses, categoricalFeaturesInfo, numTrees, return RandomForestModel(model) @classmethod + @since("1.2.0") def trainClassifier(cls, data, numClasses, categoricalFeaturesInfo, numTrees, featureSubsetStrategy="auto", impurity="gini", maxDepth=4, maxBins=32, seed=None): @@ -352,6 +379,7 @@ def trainClassifier(cls, data, numClasses, categoricalFeaturesInfo, numTrees, maxDepth, maxBins, seed) @classmethod + @since("1.2.0") def trainRegressor(cls, data, categoricalFeaturesInfo, numTrees, featureSubsetStrategy="auto", impurity="variance", maxDepth=4, maxBins=32, seed=None): """ @@ -418,6 +446,8 @@ class GradientBoostedTreesModel(TreeEnsembleModel, JavaLoader): .. note:: Experimental Represents a gradient-boosted tree model. + + .. versionadded:: 1.3.0 """ @classmethod @@ -431,6 +461,8 @@ class GradientBoostedTrees(object): Learning algorithm for a gradient boosted trees model for classification or regression. + + .. 
versionadded:: 1.3.0 """ @classmethod @@ -443,6 +475,7 @@ def _train(cls, data, algo, categoricalFeaturesInfo, return GradientBoostedTreesModel(model) @classmethod + @since("1.3.0") def trainClassifier(cls, data, categoricalFeaturesInfo, loss="logLoss", numIterations=100, learningRate=0.1, maxDepth=3, maxBins=32): @@ -505,6 +538,7 @@ def trainClassifier(cls, data, categoricalFeaturesInfo, loss, numIterations, learningRate, maxDepth, maxBins) @classmethod + @since("1.3.0") def trainRegressor(cls, data, categoricalFeaturesInfo, loss="leastSquaresError", numIterations=100, learningRate=0.1, maxDepth=3, maxBins=32): From 4a0b56e8dbb3713b16e58738201d838ffc4b258b Mon Sep 17 00:00:00 2001 From: Yu ISHIKAWA Date: Thu, 17 Sep 2015 08:50:00 -0700 Subject: [PATCH 320/802] [SPARK-10279] [MLLIB] [PYSPARK] [DOCS] Add @since annotation to pyspark.mllib.util Author: Yu ISHIKAWA Closes #8689 from yu-iskw/SPARK-10279. --- python/pyspark/mllib/util.py | 28 ++++++++++++++++++++++++++-- 1 file changed, 26 insertions(+), 2 deletions(-) diff --git a/python/pyspark/mllib/util.py b/python/pyspark/mllib/util.py index 10a1e4b3eb0fc..39bc6586dd582 100644 --- a/python/pyspark/mllib/util.py +++ b/python/pyspark/mllib/util.py @@ -23,7 +23,7 @@ xrange = range basestring = str -from pyspark import SparkContext +from pyspark import SparkContext, since from pyspark.mllib.common import callMLlibFunc, inherit_doc from pyspark.mllib.linalg import Vectors, SparseVector, _convert_to_vector @@ -32,6 +32,8 @@ class MLUtils(object): """ Helper methods to load, save and pre-process data used in MLlib. + + .. versionadded:: 1.0.0 """ @staticmethod @@ -69,6 +71,7 @@ def _convert_labeled_point_to_libsvm(p): return " ".join(items) @staticmethod + @since("1.0.0") def loadLibSVMFile(sc, path, numFeatures=-1, minPartitions=None, multiclass=None): """ Loads labeled data in the LIBSVM format into an RDD of @@ -123,6 +126,7 @@ def loadLibSVMFile(sc, path, numFeatures=-1, minPartitions=None, multiclass=None return parsed.map(lambda x: LabeledPoint(x[0], Vectors.sparse(numFeatures, x[1], x[2]))) @staticmethod + @since("1.0.0") def saveAsLibSVMFile(data, dir): """ Save labeled data in LIBSVM format. @@ -147,6 +151,7 @@ def saveAsLibSVMFile(data, dir): lines.saveAsTextFile(dir) @staticmethod + @since("1.1.0") def loadLabeledPoints(sc, path, minPartitions=None): """ Load labeled points saved using RDD.saveAsTextFile. @@ -172,6 +177,7 @@ def loadLabeledPoints(sc, path, minPartitions=None): return callMLlibFunc("loadLabeledPoints", sc, path, minPartitions) @staticmethod + @since("1.5.0") def appendBias(data): """ Returns a new vector with `1.0` (bias) appended to @@ -186,6 +192,7 @@ def appendBias(data): return _convert_to_vector(np.append(vec.toArray(), 1.0)) @staticmethod + @since("1.5.0") def loadVectors(sc, path): """ Loads vectors saved using `RDD[Vector].saveAsTextFile` @@ -197,6 +204,8 @@ def loadVectors(sc, path): class Saveable(object): """ Mixin for models and transformers which may be saved as files. + + .. versionadded:: 1.3.0 """ def save(self, sc, path): @@ -222,9 +231,13 @@ class JavaSaveable(Saveable): """ Mixin for models that provide save() through their Scala implementation. + + .. 
versionadded:: 1.3.0 """ + @since("1.3.0") def save(self, sc, path): + """Save this model to the given path.""" if not isinstance(sc, SparkContext): raise TypeError("sc should be a SparkContext, got type %s" % type(sc)) if not isinstance(path, basestring): @@ -235,6 +248,8 @@ def save(self, sc, path): class Loader(object): """ Mixin for classes which can load saved models from files. + + .. versionadded:: 1.3.0 """ @classmethod @@ -256,6 +271,8 @@ class JavaLoader(Loader): """ Mixin for classes which can load saved models using its Scala implementation. + + .. versionadded:: 1.3.0 """ @classmethod @@ -280,15 +297,21 @@ def _load_java(cls, sc, path): return java_obj.load(sc._jsc.sc(), path) @classmethod + @since("1.3.0") def load(cls, sc, path): + """Load a model from the given path.""" java_model = cls._load_java(sc, path) return cls(java_model) class LinearDataGenerator(object): - """Utils for generating linear data""" + """Utils for generating linear data. + + .. versionadded:: 1.5.0 + """ @staticmethod + @since("1.5.0") def generateLinearInput(intercept, weights, xMean, xVariance, nPoints, seed, eps): """ @@ -311,6 +334,7 @@ def generateLinearInput(intercept, weights, xMean, xVariance, xVariance, int(nPoints), int(seed), float(eps))) @staticmethod + @since("1.5.0") def generateLinearRDD(sc, nexamples, nfeatures, eps, nParts=2, intercept=0.0): """ From c74d38fd8faf8cba981cf934341d24b9a3167025 Mon Sep 17 00:00:00 2001 From: Yu ISHIKAWA Date: Thu, 17 Sep 2015 08:50:46 -0700 Subject: [PATCH 321/802] [SPARK-10274] [MLLIB] Add @since annotation to pyspark.mllib.fpm Author: Yu ISHIKAWA Closes #8665 from yu-iskw/SPARK-10274. --- python/pyspark/mllib/fpm.py | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) diff --git a/python/pyspark/mllib/fpm.py b/python/pyspark/mllib/fpm.py index bdc4a132b1b18..bdabba9602a8c 100644 --- a/python/pyspark/mllib/fpm.py +++ b/python/pyspark/mllib/fpm.py @@ -19,7 +19,7 @@ from numpy import array from collections import namedtuple -from pyspark import SparkContext +from pyspark import SparkContext, since from pyspark.rdd import ignore_unicode_prefix from pyspark.mllib.common import JavaModelWrapper, callMLlibFunc, inherit_doc @@ -41,8 +41,11 @@ class FPGrowthModel(JavaModelWrapper): >>> model = FPGrowth.train(rdd, 0.6, 2) >>> sorted(model.freqItemsets().collect()) [FreqItemset(items=[u'a'], freq=4), FreqItemset(items=[u'c'], freq=3), ... + + .. versionadded:: 1.4.0 """ + @since("1.4.0") def freqItemsets(self): """ Returns the frequent itemsets of this model. @@ -55,9 +58,12 @@ class FPGrowth(object): .. note:: Experimental A Parallel FP-growth algorithm to mine frequent itemsets. + + .. versionadded:: 1.4.0 """ @classmethod + @since("1.4.0") def train(cls, data, minSupport=0.3, numPartitions=-1): """ Computes an FP-Growth model that contains frequent itemsets. @@ -74,6 +80,8 @@ def train(cls, data, minSupport=0.3, numPartitions=-1): class FreqItemset(namedtuple("FreqItemset", ["items", "freq"])): """ Represents an (items, freq) tuple. + + .. versionadded:: 1.4.0 """ From 268088b899e6e165e746aed87840d47bfaf50c43 Mon Sep 17 00:00:00 2001 From: Yu ISHIKAWA Date: Thu, 17 Sep 2015 08:51:19 -0700 Subject: [PATCH 322/802] [SPARK-10282] [ML] [PYSPARK] [DOCS] Add @since annotation to pyspark.ml.recommendation Author: Yu ISHIKAWA Closes #8692 from yu-iskw/SPARK-10282. 
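For context on what these annotations do: the `since` decorator imported from `pyspark` appends a Sphinx `.. versionadded::` directive to the decorated callable's docstring, which is how the version notes appear in the generated API docs without touching runtime behaviour. A minimal sketch of such a decorator (simplified, not the exact shipped implementation) could look like:

```python
import re


def since(version):
    """Sketch of a decorator that records the version a public API was added in."""
    indent_pattern = re.compile(r'\n( +)')

    def decorator(f):
        # Reuse the docstring's existing indentation so Sphinx parses the directive correctly.
        doc = f.__doc__ or ""
        indents = indent_pattern.findall(doc)
        indent = ' ' * (min(len(i) for i in indents) if indents else 0)
        f.__doc__ = doc.rstrip() + "\n\n%s.. versionadded:: %s" % (indent, version)
        return f
    return decorator
```

Applied as `@since("1.4.0")` above a method, it only augments the documentation; the decorated function is returned unchanged.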
--- python/pyspark/ml/recommendation.py | 28 ++++++++++++++++++++++++++++ 1 file changed, 28 insertions(+) diff --git a/python/pyspark/ml/recommendation.py b/python/pyspark/ml/recommendation.py index b06099ac0aee6..ec5748a1cfe94 100644 --- a/python/pyspark/ml/recommendation.py +++ b/python/pyspark/ml/recommendation.py @@ -15,6 +15,7 @@ # limitations under the License. # +from pyspark import since from pyspark.ml.util import keyword_only from pyspark.ml.wrapper import JavaEstimator, JavaModel from pyspark.ml.param.shared import * @@ -80,6 +81,8 @@ class ALS(JavaEstimator, HasCheckpointInterval, HasMaxIter, HasPredictionCol, Ha Row(user=1, item=0, prediction=3.19...) >>> predictions[2] Row(user=2, item=0, prediction=-1.15...) + + .. versionadded:: 1.4.0 """ # a placeholder to make it appear in the generated doc @@ -122,6 +125,7 @@ def __init__(self, rank=10, maxIter=10, regParam=0.1, numUserBlocks=10, numItemB self.setParams(**kwargs) @keyword_only + @since("1.4.0") def setParams(self, rank=10, maxIter=10, regParam=0.1, numUserBlocks=10, numItemBlocks=10, implicitPrefs=False, alpha=1.0, userCol="user", itemCol="item", seed=None, ratingCol="rating", nonnegative=False, checkpointInterval=10): @@ -137,6 +141,7 @@ def setParams(self, rank=10, maxIter=10, regParam=0.1, numUserBlocks=10, numItem def _create_model(self, java_model): return ALSModel(java_model) + @since("1.4.0") def setRank(self, value): """ Sets the value of :py:attr:`rank`. @@ -144,12 +149,14 @@ def setRank(self, value): self._paramMap[self.rank] = value return self + @since("1.4.0") def getRank(self): """ Gets the value of rank or its default value. """ return self.getOrDefault(self.rank) + @since("1.4.0") def setNumUserBlocks(self, value): """ Sets the value of :py:attr:`numUserBlocks`. @@ -157,12 +164,14 @@ def setNumUserBlocks(self, value): self._paramMap[self.numUserBlocks] = value return self + @since("1.4.0") def getNumUserBlocks(self): """ Gets the value of numUserBlocks or its default value. """ return self.getOrDefault(self.numUserBlocks) + @since("1.4.0") def setNumItemBlocks(self, value): """ Sets the value of :py:attr:`numItemBlocks`. @@ -170,12 +179,14 @@ def setNumItemBlocks(self, value): self._paramMap[self.numItemBlocks] = value return self + @since("1.4.0") def getNumItemBlocks(self): """ Gets the value of numItemBlocks or its default value. """ return self.getOrDefault(self.numItemBlocks) + @since("1.4.0") def setNumBlocks(self, value): """ Sets both :py:attr:`numUserBlocks` and :py:attr:`numItemBlocks` to the specific value. @@ -183,6 +194,7 @@ def setNumBlocks(self, value): self._paramMap[self.numUserBlocks] = value self._paramMap[self.numItemBlocks] = value + @since("1.4.0") def setImplicitPrefs(self, value): """ Sets the value of :py:attr:`implicitPrefs`. @@ -190,12 +202,14 @@ def setImplicitPrefs(self, value): self._paramMap[self.implicitPrefs] = value return self + @since("1.4.0") def getImplicitPrefs(self): """ Gets the value of implicitPrefs or its default value. """ return self.getOrDefault(self.implicitPrefs) + @since("1.4.0") def setAlpha(self, value): """ Sets the value of :py:attr:`alpha`. @@ -203,12 +217,14 @@ def setAlpha(self, value): self._paramMap[self.alpha] = value return self + @since("1.4.0") def getAlpha(self): """ Gets the value of alpha or its default value. """ return self.getOrDefault(self.alpha) + @since("1.4.0") def setUserCol(self, value): """ Sets the value of :py:attr:`userCol`. 
@@ -216,12 +232,14 @@ def setUserCol(self, value): self._paramMap[self.userCol] = value return self + @since("1.4.0") def getUserCol(self): """ Gets the value of userCol or its default value. """ return self.getOrDefault(self.userCol) + @since("1.4.0") def setItemCol(self, value): """ Sets the value of :py:attr:`itemCol`. @@ -229,12 +247,14 @@ def setItemCol(self, value): self._paramMap[self.itemCol] = value return self + @since("1.4.0") def getItemCol(self): """ Gets the value of itemCol or its default value. """ return self.getOrDefault(self.itemCol) + @since("1.4.0") def setRatingCol(self, value): """ Sets the value of :py:attr:`ratingCol`. @@ -242,12 +262,14 @@ def setRatingCol(self, value): self._paramMap[self.ratingCol] = value return self + @since("1.4.0") def getRatingCol(self): """ Gets the value of ratingCol or its default value. """ return self.getOrDefault(self.ratingCol) + @since("1.4.0") def setNonnegative(self, value): """ Sets the value of :py:attr:`nonnegative`. @@ -255,6 +277,7 @@ def setNonnegative(self, value): self._paramMap[self.nonnegative] = value return self + @since("1.4.0") def getNonnegative(self): """ Gets the value of nonnegative or its default value. @@ -265,14 +288,18 @@ def getNonnegative(self): class ALSModel(JavaModel): """ Model fitted by ALS. + + .. versionadded:: 1.4.0 """ @property + @since("1.4.0") def rank(self): """rank of the matrix factorization model""" return self._call_java("rank") @property + @since("1.4.0") def userFactors(self): """ a DataFrame that stores user factors in two columns: `id` and @@ -281,6 +308,7 @@ def userFactors(self): return self._call_java("userFactors") @property + @since("1.4.0") def itemFactors(self): """ a DataFrame that stores item factors in two columns: `id` and From e51345e1e04e439827a07c95887d14ba38333057 Mon Sep 17 00:00:00 2001 From: Holden Karau Date: Thu, 17 Sep 2015 09:17:43 -0700 Subject: [PATCH 323/802] [SPARK-10077] [DOCS] [ML] Add package info for java of ml/feature Should be the same as SPARK-7808 but use Java for the code example. It would be great to add package doc for `spark.ml.feature`. Author: Holden Karau Closes #8740 from holdenk/SPARK-10077-JAVA-PACKAGE-DOC-FOR-SPARK.ML.FEATURE. --- .../apache/spark/ml/feature/package-info.java | 108 ++++++++++++++++++ 1 file changed, 108 insertions(+) create mode 100644 mllib/src/main/scala/org/apache/spark/ml/feature/package-info.java diff --git a/mllib/src/main/scala/org/apache/spark/ml/feature/package-info.java b/mllib/src/main/scala/org/apache/spark/ml/feature/package-info.java new file mode 100644 index 0000000000000..c22d2e0cd2d90 --- /dev/null +++ b/mllib/src/main/scala/org/apache/spark/ml/feature/package-info.java @@ -0,0 +1,108 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + + +/** + * Feature transformers + * + * The `ml.feature` package provides common feature transformers that help convert raw data or + * features into more suitable forms for model fitting. + * Most feature transformers are implemented as {@link org.apache.spark.ml.Transformer}s, which + * transforms one {@link org.apache.spark.sql.DataFrame} into another, e.g., + * {@link org.apache.spark.feature.HashingTF}. + * Some feature transformers are implemented as {@link org.apache.spark.ml.Estimator}}s, because the + * transformation requires some aggregated information of the dataset, e.g., document + * frequencies in {@link org.apache.spark.ml.feature.IDF}. + * For those feature transformers, calling {@link org.apache.spark.ml.Estimator#fit} is required to + * obtain the model first, e.g., {@link org.apache.spark.ml.feature.IDFModel}, in order to apply + * transformation. + * The transformation is usually done by appending new columns to the input + * {@link org.apache.spark.sql.DataFrame}, so all input columns are carried over. + * + * We try to make each transformer minimal, so it becomes flexible to assemble feature + * transformation pipelines. + * {@link org.apache.spark.ml.Pipeline} can be used to chain feature transformers, and + * {@link org.apache.spark.ml.feature.VectorAssembler} can be used to combine multiple feature + * transformations, for example: + * + *
+ * 
+ *   import java.util.Arrays;
+ *
+ *   import org.apache.spark.api.java.JavaRDD;
+ *   import static org.apache.spark.sql.types.DataTypes.*;
+ *   import org.apache.spark.sql.types.StructType;
+ *   import org.apache.spark.sql.DataFrame;
+ *   import org.apache.spark.sql.RowFactory;
+ *   import org.apache.spark.sql.Row;
+ *
+ *   import org.apache.spark.ml.feature.*;
+ *   import org.apache.spark.ml.Pipeline;
+ *   import org.apache.spark.ml.PipelineStage;
+ *   import org.apache.spark.ml.PipelineModel;
+ *
+ *  // a DataFrame with three columns: id (integer), text (string), and rating (double).
+ *  StructType schema = createStructType(
+ *    Arrays.asList(
+ *      createStructField("id", IntegerType, false),
+ *      createStructField("text", StringType, false),
+ *      createStructField("rating", DoubleType, false)));
+ *  JavaRDD<Row> rowRDD = jsc.parallelize(
+ *    Arrays.asList(
+ *      RowFactory.create(0, "Hi I heard about Spark", 3.0),
+ *      RowFactory.create(1, "I wish Java could use case classes", 4.0),
+ *      RowFactory.create(2, "Logistic regression models are neat", 4.0)));
+ *  DataFrame df = jsql.createDataFrame(rowRDD, schema);
+ *  // define feature transformers
+ *  RegexTokenizer tok = new RegexTokenizer()
+ *    .setInputCol("text")
+ *    .setOutputCol("words");
+ *  StopWordsRemover sw = new StopWordsRemover()
+ *    .setInputCol("words")
+ *    .setOutputCol("filtered_words");
+ *  HashingTF tf = new HashingTF()
+ *    .setInputCol("filtered_words")
+ *    .setOutputCol("tf")
+ *    .setNumFeatures(10000);
+ *  IDF idf = new IDF()
+ *    .setInputCol("tf")
+ *    .setOutputCol("tf_idf");
+ *  VectorAssembler assembler = new VectorAssembler()
+ *    .setInputCols(new String[] {"tf_idf", "rating"})
+ *    .setOutputCol("features");
+ *
+ *  // assemble and fit the feature transformation pipeline
+ *  Pipeline pipeline = new Pipeline()
+ *    .setStages(new PipelineStage[] {tok, sw, tf, idf, assembler});
+ *  PipelineModel model = pipeline.fit(df);
+ *
+ *  // save transformed features with raw data
+ *  model.transform(df)
+ *    .select("id", "text", "rating", "features")
+ *    .write().format("parquet").save("/output/path");
+ * 
+ * 
+ * + * Some feature transformers implemented in MLlib are inspired by those implemented in scikit-learn. + * The major difference is that most scikit-learn feature transformers operate eagerly on the entire + * input dataset, while MLlib's feature transformers operate lazily on individual columns, + * which is more efficient and flexible to handle large and complex datasets. + * + * @see + * scikit-learn.preprocessing + */ +package org.apache.spark.ml.feature; From 2a508df20d03b3d4a3c05b65fb02d849bc080ef9 Mon Sep 17 00:00:00 2001 From: Liang-Chi Hsieh Date: Thu, 17 Sep 2015 09:21:21 -0700 Subject: [PATCH 324/802] [SPARK-10459] [SQL] Do not need to have ConvertToSafe for PythonUDF JIRA: https://issues.apache.org/jira/browse/SPARK-10459 As mentioned in the JIRA, `PythonUDF` actually could process `UnsafeRow`. Specially, the rows in `childResults` in `BatchPythonEvaluation` will be projected to a `MutableRow`. So I think we can enable `canProcessUnsafeRows` for `BatchPythonEvaluation` and get rid of redundant `ConvertToSafe`. Author: Liang-Chi Hsieh Closes #8616 from viirya/pyudf-unsafe. --- .../scala/org/apache/spark/sql/execution/pythonUDFs.scala | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/pythonUDFs.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/pythonUDFs.scala index 5a58d846ad80b..d0411da6fdf5a 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/pythonUDFs.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/pythonUDFs.scala @@ -337,6 +337,10 @@ case class BatchPythonEvaluation(udf: PythonUDF, output: Seq[Attribute], child: def children: Seq[SparkPlan] = child :: Nil + override def outputsUnsafeRows: Boolean = false + override def canProcessUnsafeRows: Boolean = true + override def canProcessSafeRows: Boolean = true + protected override def doExecute(): RDD[InternalRow] = { val childResults = child.execute().map(_.copy()) From c88bb5df94f9696677c3a429472114bc66f32a52 Mon Sep 17 00:00:00 2001 From: "yangping.wu" Date: Thu, 17 Sep 2015 09:52:40 -0700 Subject: [PATCH 325/802] [SPARK-10660] Doc describe error in the "Running Spark on YARN" page MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit In the Configuration section, the **spark.yarn.driver.memoryOverhead** and **spark.yarn.am.memoryOverhead**‘s default value should be "driverMemory * 0.10, with minimum of 384" and "AM memory * 0.10, with minimum of 384" respectively. Because from Spark 1.4.0, the **MEMORY_OVERHEAD_FACTOR** is set to 0.1.0, not 0.07. Author: yangping.wu Closes #8797 from 397090770/SparkOnYarnDocError. --- docs/running-on-yarn.md | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/docs/running-on-yarn.md b/docs/running-on-yarn.md index d1244323edfff..3a961d245f3de 100644 --- a/docs/running-on-yarn.md +++ b/docs/running-on-yarn.md @@ -211,14 +211,14 @@ If you need a reference to the proper location to put log files in the YARN so t spark.yarn.driver.memoryOverhead - driverMemory * 0.07, with minimum of 384 + driverMemory * 0.10, with minimum of 384 The amount of off heap memory (in megabytes) to be allocated per driver in cluster mode. This is memory that accounts for things like VM overheads, interned strings, other native overheads, etc. This tends to grow with the container size (typically 6-10%). 
spark.yarn.am.memoryOverhead - AM memory * 0.07, with minimum of 384 + AM memory * 0.10, with minimum of 384 Same as spark.yarn.driver.memoryOverhead, but for the Application Master in client mode. From 136c77d8bbf48f7c45dd7c3fbe261a0476f455fe Mon Sep 17 00:00:00 2001 From: Liang-Chi Hsieh Date: Thu, 17 Sep 2015 10:02:15 -0700 Subject: [PATCH 326/802] [SPARK-10642] [PYSPARK] Fix crash when calling rdd.lookup() on tuple keys JIRA: https://issues.apache.org/jira/browse/SPARK-10642 When calling `rdd.lookup()` on a RDD with tuple keys, `portable_hash` will return a long. That causes `DAGScheduler.submitJob` to throw `java.lang.ClassCastException: java.lang.Long cannot be cast to java.lang.Integer`. Author: Liang-Chi Hsieh Closes #8796 from viirya/fix-pyrdd-lookup. --- python/pyspark/rdd.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/python/pyspark/rdd.py b/python/pyspark/rdd.py index 9ef60a7e2c84b..ab5aab1e115f7 100644 --- a/python/pyspark/rdd.py +++ b/python/pyspark/rdd.py @@ -84,7 +84,7 @@ def portable_hash(x): h ^= len(x) if h == -1: h = -2 - return h + return int(h) return hash(x) @@ -2192,6 +2192,9 @@ def lookup(self, key): [42] >>> sorted.lookup(1024) [] + >>> rdd2 = sc.parallelize([(('a', 'b'), 'c')]).groupByKey() + >>> list(rdd2.lookup(('a', 'b'))[0]) + ['c'] """ values = self.filter(lambda kv: kv[0] == key).values() From 81b4db374dd61b6f1c30511c70b6ab2a52c68faa Mon Sep 17 00:00:00 2001 From: Josiah Samuel Date: Thu, 17 Sep 2015 10:18:21 -0700 Subject: [PATCH 327/802] [SPARK-10172] [CORE] disable sort in HistoryServer webUI This pull request is to address the JIRA SPARK-10172 (History Server web UI gets messed up when sorting on any column). The content of the table gets messed up due to the rowspan attribute of the table data(cell) during sorting. The current table sort library used in SparkUI (sorttable.js) doesn't support/handle cells(td) with rowspans. The fix will disable the table sort in the web UI, when there are jobs listed with multiple attempts. Author: Josiah Samuel Closes #8506 from josiahsams/SPARK-10172. --- .../scala/org/apache/spark/deploy/history/HistoryPage.scala | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/core/src/main/scala/org/apache/spark/deploy/history/HistoryPage.scala b/core/src/main/scala/org/apache/spark/deploy/history/HistoryPage.scala index 0830cc1ba1245..b347cb3be69f7 100644 --- a/core/src/main/scala/org/apache/spark/deploy/history/HistoryPage.scala +++ b/core/src/main/scala/org/apache/spark/deploy/history/HistoryPage.scala @@ -51,7 +51,10 @@ private[history] class HistoryPage(parent: HistoryServer) extends WebUIPage("") val hasMultipleAttempts = appsToShow.exists(_.attempts.size > 1) val appTable = if (hasMultipleAttempts) { - UIUtils.listingTable(appWithAttemptHeader, appWithAttemptRow, appsToShow) + // Sorting is disable here as table sort on rowspan has issues. + // ref. SPARK-10172 + UIUtils.listingTable(appWithAttemptHeader, appWithAttemptRow, + appsToShow, sortable = false) } else { UIUtils.listingTable(appHeader, appRow, appsToShow) } From 36d8b278d82e788bf583e8438fac524d0023311d Mon Sep 17 00:00:00 2001 From: Jeff Zhang Date: Thu, 17 Sep 2015 10:25:18 -0700 Subject: [PATCH 328/802] [SPARK-10531] [CORE] AppId is set as AppName in status rest api Verify it manually. Author: Jeff Zhang Closes #8688 from zjffdu/SPARK-10531. 
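A side note on the SPARK-10642 fix above (`return int(h)` in `portable_hash`): on 64-bit Python 2 builds, the tuple-hash arithmetic can promote the running value to `long`, and it stays a `long` even after masking, so the partition index derived from it crosses Py4J as a `java.lang.Long` where `DAGScheduler.submitJob` expects an `Integer`. A rough sketch of the promotion (the arithmetic loosely mirrors `portable_hash`; values are illustrative):

```python
# Python 2, 64-bit build
import sys

h = 0x345678
for item in ('a', 'b'):
    h ^= hash(item)
    h *= 1000003        # can overflow the C long range -> promoted to Python long
    h &= sys.maxsize    # value is back in range, but the *type* remains long
h ^= 2                  # len(('a', 'b'))

print(type(h))          # <type 'long'> -- Py4J would send this as java.lang.Long
print(type(int(h)))     # <type 'int'>  -- the fix, so the JVM receives an Integer
```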
--- .../main/scala/org/apache/spark/SparkContext.scala | 1 + .../spark/deploy/history/FsHistoryProvider.scala | 9 ++++----- .../scala/org/apache/spark/deploy/master/Master.scala | 2 +- core/src/main/scala/org/apache/spark/ui/SparkUI.scala | 11 ++++++----- .../scala/org/apache/spark/ui/UISeleniumSuite.scala | 2 +- 5 files changed, 13 insertions(+), 12 deletions(-) diff --git a/core/src/main/scala/org/apache/spark/SparkContext.scala b/core/src/main/scala/org/apache/spark/SparkContext.scala index a2f34eafa2c38..9c3218719f7fc 100644 --- a/core/src/main/scala/org/apache/spark/SparkContext.scala +++ b/core/src/main/scala/org/apache/spark/SparkContext.scala @@ -521,6 +521,7 @@ class SparkContext(config: SparkConf) extends Logging with ExecutorAllocationCli _applicationId = _taskScheduler.applicationId() _applicationAttemptId = taskScheduler.applicationAttemptId() _conf.set("spark.app.id", _applicationId) + _ui.foreach(_.setAppId(_applicationId)) _env.blockManager.initialize(_applicationId) // The metrics system for Driver need to be set spark.app.id to app ID. diff --git a/core/src/main/scala/org/apache/spark/deploy/history/FsHistoryProvider.scala b/core/src/main/scala/org/apache/spark/deploy/history/FsHistoryProvider.scala index a5755eac36396..8eb2ba1e8683b 100644 --- a/core/src/main/scala/org/apache/spark/deploy/history/FsHistoryProvider.scala +++ b/core/src/main/scala/org/apache/spark/deploy/history/FsHistoryProvider.scala @@ -146,16 +146,15 @@ private[history] class FsHistoryProvider(conf: SparkConf, clock: Clock) val ui = { val conf = this.conf.clone() val appSecManager = new SecurityManager(conf) - SparkUI.createHistoryUI(conf, replayBus, appSecManager, appId, + SparkUI.createHistoryUI(conf, replayBus, appSecManager, appInfo.name, HistoryServer.getAttemptURI(appId, attempt.attemptId), attempt.startTime) // Do not call ui.bind() to avoid creating a new server for each application } val appListener = new ApplicationEventListener() replayBus.addListener(appListener) - val appInfo = replay(fs.getFileStatus(new Path(logDir, attempt.logPath)), replayBus) - appInfo.map { info => - ui.setAppName(s"${info.name} ($appId)") - + val appAttemptInfo = replay(fs.getFileStatus(new Path(logDir, attempt.logPath)), + replayBus) + appAttemptInfo.map { info => val uiAclsEnabled = conf.getBoolean("spark.history.ui.acls.enable", false) ui.getSecurityManager.setAcls(uiAclsEnabled) // make sure to set admin acls before view acls so they are properly picked up diff --git a/core/src/main/scala/org/apache/spark/deploy/master/Master.scala b/core/src/main/scala/org/apache/spark/deploy/master/Master.scala index 26904d39a9bec..d518e92133aad 100644 --- a/core/src/main/scala/org/apache/spark/deploy/master/Master.scala +++ b/core/src/main/scala/org/apache/spark/deploy/master/Master.scala @@ -944,7 +944,7 @@ private[deploy] class Master( val logInput = EventLoggingListener.openEventLog(new Path(eventLogFile), fs) val replayBus = new ReplayListenerBus() val ui = SparkUI.createHistoryUI(new SparkConf, replayBus, new SecurityManager(conf), - appName + status, HistoryServer.UI_PATH_PREFIX + s"/${app.id}", app.startTime) + appName, HistoryServer.UI_PATH_PREFIX + s"/${app.id}", app.startTime) val maybeTruncated = eventLogFile.endsWith(EventLoggingListener.IN_PROGRESS) try { replayBus.replay(logInput, eventLogFile, maybeTruncated) diff --git a/core/src/main/scala/org/apache/spark/ui/SparkUI.scala b/core/src/main/scala/org/apache/spark/ui/SparkUI.scala index d8b90568b7b9a..99085ada9f0af 100644 --- 
a/core/src/main/scala/org/apache/spark/ui/SparkUI.scala +++ b/core/src/main/scala/org/apache/spark/ui/SparkUI.scala @@ -56,6 +56,8 @@ private[spark] class SparkUI private ( val stagesTab = new StagesTab(this) + var appId: String = _ + /** Initialize all components of the server. */ def initialize() { attachTab(new JobsTab(this)) @@ -75,9 +77,8 @@ private[spark] class SparkUI private ( def getAppName: String = appName - /** Set the app name for this UI. */ - def setAppName(name: String) { - appName = name + def setAppId(id: String): Unit = { + appId = id } /** Stop the server behind this web interface. Only valid after bind(). */ @@ -94,12 +95,12 @@ private[spark] class SparkUI private ( private[spark] def appUIAddress = s"http://$appUIHostPort" def getSparkUI(appId: String): Option[SparkUI] = { - if (appId == appName) Some(this) else None + if (appId == this.appId) Some(this) else None } def getApplicationInfoList: Iterator[ApplicationInfo] = { Iterator(new ApplicationInfo( - id = appName, + id = appId, name = appName, attempts = Seq(new ApplicationAttemptInfo( attemptId = None, diff --git a/core/src/test/scala/org/apache/spark/ui/UISeleniumSuite.scala b/core/src/test/scala/org/apache/spark/ui/UISeleniumSuite.scala index 22e30ecaf0533..18eec7da9763e 100644 --- a/core/src/test/scala/org/apache/spark/ui/UISeleniumSuite.scala +++ b/core/src/test/scala/org/apache/spark/ui/UISeleniumSuite.scala @@ -658,6 +658,6 @@ class UISeleniumSuite extends SparkFunSuite with WebBrowser with Matchers with B } def apiUrl(ui: SparkUI, path: String): URL = { - new URL(ui.appUIAddress + "/api/v1/applications/test/" + path) + new URL(ui.appUIAddress + "/api/v1/applications/" + ui.sc.get.applicationId + "/" + path) } } From e0dc2bc232206d2f4da4278502c1f88babc8b55a Mon Sep 17 00:00:00 2001 From: Michael Armbrust Date: Thu, 17 Sep 2015 11:05:30 -0700 Subject: [PATCH 329/802] [SPARK-10650] Clean before building docs The [published docs for 1.5.0](http://spark.apache.org/docs/1.5.0/api/java/org/apache/spark/streaming/) have a bunch of test classes in them. The only way I can reproduce this is to `test:compile` before running `unidoc`. To prevent this from happening again, I've added a clean before doc generation. Author: Michael Armbrust Closes #8787 from marmbrus/testsInDocs. --- docs/_plugins/copy_api_dirs.rb | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/docs/_plugins/copy_api_dirs.rb b/docs/_plugins/copy_api_dirs.rb index 15ceda11a8a80..01718d98dffe0 100644 --- a/docs/_plugins/copy_api_dirs.rb +++ b/docs/_plugins/copy_api_dirs.rb @@ -26,12 +26,15 @@ curr_dir = pwd cd("..") - puts "Running 'build/sbt -Pkinesis-asl compile unidoc' from " + pwd + "; this may take a few minutes..." - puts `build/sbt -Pkinesis-asl compile unidoc` + puts "Running 'build/sbt -Pkinesis-asl clean compile unidoc' from " + pwd + "; this may take a few minutes..." + puts `build/sbt -Pkinesis-asl clean compile unidoc` puts "Moving back into docs dir." cd("docs") + puts "Removing old docs" + puts `rm -rf api` + # Copy over the unified ScalaDoc for all projects to api/scala. # This directory will be copied over to _site when `jekyll` command is run. source = "../target/scala-2.10/unidoc" From aad644fbe29151aec9004817d42e4928bdb326f3 Mon Sep 17 00:00:00 2001 From: Yin Huai Date: Thu, 17 Sep 2015 11:14:52 -0700 Subject: [PATCH 330/802] [SPARK-10639] [SQL] Need to convert UDAF's result from scala to sql type https://issues.apache.org/jira/browse/SPARK-10639 Author: Yin Huai Closes #8788 from yhuai/udafConversion. 
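For a concrete sense of what needs converting (a Scala sketch, since UDAFs are defined through the Scala/Java `UserDefinedAggregateFunction` API): an `evaluate` that returns a non-primitive Scala value, such as a `Seq` or a `Row`, hands back an object that cannot be stored in Catalyst's internal rows as-is, which is why this patch routes the result through `CatalystTypeConverters`. The class below is hypothetical and only illustrates the shape of such a UDAF:

```scala
import org.apache.spark.sql.Row
import org.apache.spark.sql.expressions.{MutableAggregationBuffer, UserDefinedAggregateFunction}
import org.apache.spark.sql.types._

// Hypothetical UDAF whose result type is an array: evaluate() returns a plain Scala Seq,
// which must be converted to Catalyst's internal representation before it is emitted.
class CollectDoubles extends UserDefinedAggregateFunction {
  def inputSchema: StructType = new StructType().add("value", DoubleType)
  def bufferSchema: StructType = new StructType().add("values", ArrayType(DoubleType))
  def dataType: DataType = ArrayType(DoubleType)
  def deterministic: Boolean = true

  def initialize(buffer: MutableAggregationBuffer): Unit =
    buffer.update(0, Seq.empty[Double])

  def update(buffer: MutableAggregationBuffer, input: Row): Unit =
    if (!input.isNullAt(0)) buffer.update(0, buffer.getSeq[Double](0) :+ input.getDouble(0))

  def merge(buffer1: MutableAggregationBuffer, buffer2: Row): Unit =
    buffer1.update(0, buffer1.getSeq[Double](0) ++ buffer2.getSeq[Double](0))

  // Returns a Scala Seq, not a Catalyst value: without the conversion added in this patch,
  // the engine would receive an object type it does not expect.
  def evaluate(buffer: Row): Any = buffer.getSeq[Double](0)
}
```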
--- .../sql/catalyst/CatalystTypeConverters.scala | 7 +- .../spark/sql/RandomDataGenerator.scala | 16 ++- .../spark/sql/execution/aggregate/udaf.scala | 37 +++++- .../org/apache/spark/sql/QueryTest.scala | 21 ++-- .../spark/sql/UserDefinedTypeSuite.scala | 11 ++ .../execution/AggregationQuerySuite.scala | 108 +++++++++++++++++- 6 files changed, 188 insertions(+), 12 deletions(-) diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/CatalystTypeConverters.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/CatalystTypeConverters.scala index 966623ed017ba..f25591794abdb 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/CatalystTypeConverters.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/CatalystTypeConverters.scala @@ -138,8 +138,13 @@ object CatalystTypeConverters { private case class UDTConverter( udt: UserDefinedType[_]) extends CatalystTypeConverter[Any, Any, Any] { + // toCatalyst (it calls toCatalystImpl) will do null check. override def toCatalystImpl(scalaValue: Any): Any = udt.serialize(scalaValue) - override def toScala(catalystValue: Any): Any = udt.deserialize(catalystValue) + + override def toScala(catalystValue: Any): Any = { + if (catalystValue == null) null else udt.deserialize(catalystValue) + } + override def toScalaImpl(row: InternalRow, column: Int): Any = toScala(row.get(column, udt.sqlType)) } diff --git a/sql/catalyst/src/test/scala/org/apache/spark/sql/RandomDataGenerator.scala b/sql/catalyst/src/test/scala/org/apache/spark/sql/RandomDataGenerator.scala index 4025cbcec1019..e48395028e399 100644 --- a/sql/catalyst/src/test/scala/org/apache/spark/sql/RandomDataGenerator.scala +++ b/sql/catalyst/src/test/scala/org/apache/spark/sql/RandomDataGenerator.scala @@ -108,7 +108,21 @@ object RandomDataGenerator { arr }) case BooleanType => Some(() => rand.nextBoolean()) - case DateType => Some(() => new java.sql.Date(rand.nextInt())) + case DateType => + val generator = + () => { + var milliseconds = rand.nextLong() % 253402329599999L + // -62135740800000L is the number of milliseconds before January 1, 1970, 00:00:00 GMT + // for "0001-01-01 00:00:00.000000". We need to find a + // number that is greater or equals to this number as a valid timestamp value. + while (milliseconds < -62135740800000L) { + // 253402329599999L is the the number of milliseconds since + // January 1, 1970, 00:00:00 GMT for "9999-12-31 23:59:59.999999". 
+ milliseconds = rand.nextLong() % 253402329599999L + } + DateTimeUtils.toJavaDate((milliseconds / DateTimeUtils.MILLIS_PER_DAY).toInt) + } + Some(generator) case TimestampType => val generator = () => { diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/aggregate/udaf.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/aggregate/udaf.scala index d43d3dd9ffaae..1114fe6552bdb 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/aggregate/udaf.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/aggregate/udaf.scala @@ -40,6 +40,9 @@ sealed trait BufferSetterGetterUtils { var i = 0 while (i < getters.length) { getters(i) = dataTypes(i) match { + case NullType => + (row: InternalRow, ordinal: Int) => null + case BooleanType => (row: InternalRow, ordinal: Int) => if (row.isNullAt(ordinal)) null else row.getBoolean(ordinal) @@ -74,6 +77,14 @@ sealed trait BufferSetterGetterUtils { (row: InternalRow, ordinal: Int) => if (row.isNullAt(ordinal)) null else row.getDecimal(ordinal, precision, scale) + case DateType => + (row: InternalRow, ordinal: Int) => + if (row.isNullAt(ordinal)) null else row.getInt(ordinal) + + case TimestampType => + (row: InternalRow, ordinal: Int) => + if (row.isNullAt(ordinal)) null else row.getLong(ordinal) + case other => (row: InternalRow, ordinal: Int) => if (row.isNullAt(ordinal)) null else row.get(ordinal, other) @@ -92,6 +103,9 @@ sealed trait BufferSetterGetterUtils { var i = 0 while (i < setters.length) { setters(i) = dataTypes(i) match { + case NullType => + (row: MutableRow, ordinal: Int, value: Any) => row.setNullAt(ordinal) + case b: BooleanType => (row: MutableRow, ordinal: Int, value: Any) => if (value != null) { @@ -150,9 +164,23 @@ sealed trait BufferSetterGetterUtils { case dt: DecimalType => val precision = dt.precision + (row: MutableRow, ordinal: Int, value: Any) => + // To make it work with UnsafeRow, we cannot use setNullAt. + // Please see the comment of UnsafeRow's setDecimal. + row.setDecimal(ordinal, value.asInstanceOf[Decimal], precision) + + case DateType => (row: MutableRow, ordinal: Int, value: Any) => if (value != null) { - row.setDecimal(ordinal, value.asInstanceOf[Decimal], precision) + row.setInt(ordinal, value.asInstanceOf[Int]) + } else { + row.setNullAt(ordinal) + } + + case TimestampType => + (row: MutableRow, ordinal: Int, value: Any) => + if (value != null) { + row.setLong(ordinal, value.asInstanceOf[Long]) } else { row.setNullAt(ordinal) } @@ -205,6 +233,7 @@ private[sql] class MutableAggregationBufferImpl ( throw new IllegalArgumentException( s"Could not access ${i}th value in this buffer because it only has $length values.") } + toScalaConverters(i)(bufferValueGetters(i)(underlyingBuffer, offsets(i))) } @@ -352,6 +381,10 @@ private[sql] case class ScalaUDAF( } } + private[this] lazy val outputToCatalystConverter: Any => Any = { + CatalystTypeConverters.createToCatalystConverter(dataType) + } + // This buffer is only used at executor side. 
private[this] var inputAggregateBuffer: InputAggregationBuffer = null @@ -424,7 +457,7 @@ private[sql] case class ScalaUDAF( override def eval(buffer: InternalRow): Any = { evalAggregateBuffer.underlyingInputBuffer = buffer - udaf.evaluate(evalAggregateBuffer) + outputToCatalystConverter(udaf.evaluate(evalAggregateBuffer)) } override def toString: String = { diff --git a/sql/core/src/test/scala/org/apache/spark/sql/QueryTest.scala b/sql/core/src/test/scala/org/apache/spark/sql/QueryTest.scala index cada03e9ac6bb..e3c5a426671d0 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/QueryTest.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/QueryTest.scala @@ -115,19 +115,26 @@ object QueryTest { */ def checkAnswer(df: DataFrame, expectedAnswer: Seq[Row]): Option[String] = { val isSorted = df.logicalPlan.collect { case s: logical.Sort => s }.nonEmpty + + // We need to call prepareRow recursively to handle schemas with struct types. + def prepareRow(row: Row): Row = { + Row.fromSeq(row.toSeq.map { + case null => null + case d: java.math.BigDecimal => BigDecimal(d) + // Convert array to Seq for easy equality check. + case b: Array[_] => b.toSeq + case r: Row => prepareRow(r) + case o => o + }) + } + def prepareAnswer(answer: Seq[Row]): Seq[Row] = { // Converts data to types that we can do equality comparison using Scala collections. // For BigDecimal type, the Scala type has a better definition of equality test (similar to // Java's java.math.BigDecimal.compareTo). // For binary arrays, we convert it to Seq to avoid of calling java.util.Arrays.equals for // equality test. - val converted: Seq[Row] = answer.map { s => - Row.fromSeq(s.toSeq.map { - case d: java.math.BigDecimal => BigDecimal(d) - case b: Array[Byte] => b.toSeq - case o => o - }) - } + val converted: Seq[Row] = answer.map(prepareRow) if (!isSorted) converted.sortBy(_.toString()) else converted } val sparkAnswer = try df.collect().toSeq catch { diff --git a/sql/core/src/test/scala/org/apache/spark/sql/UserDefinedTypeSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/UserDefinedTypeSuite.scala index 46d87843dfa4d..7992fd59ff4ba 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/UserDefinedTypeSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/UserDefinedTypeSuite.scala @@ -22,6 +22,7 @@ import scala.beans.{BeanInfo, BeanProperty} import com.clearspring.analytics.stream.cardinality.HyperLogLog import org.apache.spark.rdd.RDD +import org.apache.spark.sql.catalyst.CatalystTypeConverters import org.apache.spark.sql.catalyst.expressions.{OpenHashSetUDT, HyperLogLogUDT} import org.apache.spark.sql.functions._ import org.apache.spark.sql.test.SharedSQLContext @@ -163,4 +164,14 @@ class UserDefinedTypeSuite extends QueryTest with SharedSQLContext { assert(new MyDenseVectorUDT().typeName === "mydensevector") assert(new OpenHashSetUDT(IntegerType).typeName === "openhashset") } + + test("Catalyst type converter null handling for UDTs") { + val udt = new MyDenseVectorUDT() + val toScalaConverter = CatalystTypeConverters.createToScalaConverter(udt) + assert(toScalaConverter(null) === null) + + val toCatalystConverter = CatalystTypeConverters.createToCatalystConverter(udt) + assert(toCatalystConverter(null) === null) + + } } diff --git a/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/AggregationQuerySuite.scala b/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/AggregationQuerySuite.scala index a73b1bd52c09f..24b1846923c77 100644 --- 
a/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/AggregationQuerySuite.scala +++ b/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/AggregationQuerySuite.scala @@ -17,13 +17,55 @@ package org.apache.spark.sql.hive.execution +import scala.collection.JavaConverters._ + import org.apache.spark.sql._ +import org.apache.spark.sql.catalyst.expressions.UnsafeRow import org.apache.spark.sql.execution.aggregate +import org.apache.spark.sql.expressions.{MutableAggregationBuffer, UserDefinedAggregateFunction} +import org.apache.spark.sql.functions._ import org.apache.spark.sql.test.SQLTestUtils -import org.apache.spark.sql.types.{IntegerType, StringType, StructField, StructType} +import org.apache.spark.sql.types._ import org.apache.spark.sql.hive.aggregate.{MyDoubleAvg, MyDoubleSum} import org.apache.spark.sql.hive.test.TestHiveSingleton +class ScalaAggregateFunction(schema: StructType) extends UserDefinedAggregateFunction { + + def inputSchema: StructType = schema + + def bufferSchema: StructType = schema + + def dataType: DataType = schema + + def deterministic: Boolean = true + + def initialize(buffer: MutableAggregationBuffer): Unit = { + (0 until schema.length).foreach { i => + buffer.update(i, null) + } + } + + def update(buffer: MutableAggregationBuffer, input: Row): Unit = { + if (!input.isNullAt(0) && input.getInt(0) == 50) { + (0 until schema.length).foreach { i => + buffer.update(i, input.get(i)) + } + } + } + + def merge(buffer1: MutableAggregationBuffer, buffer2: Row): Unit = { + if (!buffer2.isNullAt(0) && buffer2.getInt(0) == 50) { + (0 until schema.length).foreach { i => + buffer1.update(i, buffer2.get(i)) + } + } + } + + def evaluate(buffer: Row): Any = { + Row.fromSeq(buffer.toSeq) + } +} + abstract class AggregationQuerySuite extends QueryTest with SQLTestUtils with TestHiveSingleton { import testImplicits._ @@ -508,6 +550,70 @@ abstract class AggregationQuerySuite extends QueryTest with SQLTestUtils with Te assert(errorMessage.contains("implemented based on the new Aggregate Function interface")) } } + + test("udaf with all data types") { + val struct = + StructType( + StructField("f1", FloatType, true) :: + StructField("f2", ArrayType(BooleanType), true) :: Nil) + val dataTypes = Seq(StringType, BinaryType, NullType, BooleanType, + ByteType, ShortType, IntegerType, LongType, + FloatType, DoubleType, DecimalType(25, 5), DecimalType(6, 5), + DateType, TimestampType, + ArrayType(IntegerType), MapType(StringType, LongType), struct, + new MyDenseVectorUDT()) + // Right now, we will use SortBasedAggregate to handle UDAFs. + // UnsafeRow.mutableFieldTypes.asScala.toSeq will trigger SortBasedAggregate to use + // UnsafeRow as the aggregation buffer. While, dataTypes will trigger + // SortBasedAggregate to use a safe row as the aggregation buffer. + Seq(dataTypes, UnsafeRow.mutableFieldTypes.asScala.toSeq).foreach { dataTypes => + val fields = dataTypes.zipWithIndex.map { case (dataType, index) => + StructField(s"col$index", dataType, nullable = true) + } + // The schema used for data generator. + val schemaForGenerator = StructType(fields) + // The schema used for the DataFrame df. + val schema = StructType(StructField("id", IntegerType) +: fields) + + logInfo(s"Testing schema: ${schema.treeString}") + + val udaf = new ScalaAggregateFunction(schema) + // Generate data at the driver side. We need to materialize the data first and then + // create RDD. 
+ val maybeDataGenerator = + RandomDataGenerator.forType( + dataType = schemaForGenerator, + nullable = true, + seed = Some(System.nanoTime())) + val dataGenerator = + maybeDataGenerator + .getOrElse(fail(s"Failed to create data generator for schema $schemaForGenerator")) + val data = (1 to 50).map { i => + dataGenerator.apply() match { + case row: Row => Row.fromSeq(i +: row.toSeq) + case null => Row.fromSeq(i +: Seq.fill(schemaForGenerator.length)(null)) + case other => + fail(s"Row or null is expected to be generated, " + + s"but a ${other.getClass.getCanonicalName} is generated.") + } + } + + // Create a DF for the schema with random data. + val rdd = sqlContext.sparkContext.parallelize(data, 1) + val df = sqlContext.createDataFrame(rdd, schema) + + val allColumns = df.schema.fields.map(f => col(f.name)) + val expectedAnaswer = + data + .find(r => r.getInt(0) == 50) + .getOrElse(fail("A row with id 50 should be the expected answer.")) + checkAnswer( + df.groupBy().agg(udaf(allColumns: _*)), + // udaf returns a Row as the output value. + Row(expectedAnaswer) + ) + } + } } class SortBasedAggregationQuerySuite extends AggregationQuerySuite { From 64743870f23bffb8d96dcc8a0181c1452782a151 Mon Sep 17 00:00:00 2001 From: Yanbo Liang Date: Thu, 17 Sep 2015 11:24:38 -0700 Subject: [PATCH 331/802] [SPARK-10394] [ML] Make GBTParams use shared stepSize ```GBTParams``` has ```stepSize``` as learning rate currently. ML has shared param class ```HasStepSize```, ```GBTParams``` can extend from it rather than duplicated implementation. Author: Yanbo Liang Closes #8552 from yanboliang/spark-10394. --- .../org/apache/spark/ml/tree/treeParams.scala | 28 +++++++++---------- 1 file changed, 13 insertions(+), 15 deletions(-) diff --git a/mllib/src/main/scala/org/apache/spark/ml/tree/treeParams.scala b/mllib/src/main/scala/org/apache/spark/ml/tree/treeParams.scala index d29f5253c9c3f..42e74ce6d2c69 100644 --- a/mllib/src/main/scala/org/apache/spark/ml/tree/treeParams.scala +++ b/mllib/src/main/scala/org/apache/spark/ml/tree/treeParams.scala @@ -20,7 +20,7 @@ package org.apache.spark.ml.tree import org.apache.spark.ml.classification.ClassifierParams import org.apache.spark.ml.PredictorParams import org.apache.spark.ml.param._ -import org.apache.spark.ml.param.shared.{HasCheckpointInterval, HasMaxIter, HasSeed, HasThresholds} +import org.apache.spark.ml.param.shared._ import org.apache.spark.mllib.tree.configuration.{Algo => OldAlgo, BoostingStrategy => OldBoostingStrategy, Strategy => OldStrategy} import org.apache.spark.mllib.tree.impurity.{Entropy => OldEntropy, Gini => OldGini, Impurity => OldImpurity, Variance => OldVariance} import org.apache.spark.mllib.tree.loss.{Loss => OldLoss} @@ -365,17 +365,7 @@ private[ml] object RandomForestParams { * * Note: Marked as private and DeveloperApi since this may be made public in the future. */ -private[ml] trait GBTParams extends TreeEnsembleParams with HasMaxIter { - - /** - * Step size (a.k.a. learning rate) in interval (0, 1] for shrinking the contribution of each - * estimator. - * (default = 0.1) - * @group param - */ - final val stepSize: DoubleParam = new DoubleParam(this, "stepSize", "Step size (a.k.a." + - " learning rate) in interval (0, 1] for shrinking the contribution of each estimator", - ParamValidators.inRange(0, 1, lowerInclusive = false, upperInclusive = true)) +private[ml] trait GBTParams extends TreeEnsembleParams with HasMaxIter with HasStepSize { /* TODO: Add this doc when we add this param. 
SPARK-7132 * Threshold for stopping early when runWithValidation is used. @@ -393,11 +383,19 @@ private[ml] trait GBTParams extends TreeEnsembleParams with HasMaxIter { /** @group setParam */ def setMaxIter(value: Int): this.type = set(maxIter, value) - /** @group setParam */ + /** + * Step size (a.k.a. learning rate) in interval (0, 1] for shrinking the contribution of each + * estimator. + * (default = 0.1) + * @group setParam + */ def setStepSize(value: Double): this.type = set(stepSize, value) - /** @group getParam */ - final def getStepSize: Double = $(stepSize) + override def validateParams(): Unit = { + require(ParamValidators.inRange(0, 1, lowerInclusive = false, upperInclusive = true)( + getStepSize), "GBT parameter stepSize should be in interval (0, 1], " + + s"but it given invalid value $getStepSize.") + } /** (private[ml]) Create a BoostingStrategy instance to use with the old API. */ private[ml] def getOldBoostingStrategy( From f1c911552cf5d0d60831c79c1881016293aec66c Mon Sep 17 00:00:00 2001 From: Josh Rosen Date: Thu, 17 Sep 2015 11:40:24 -0700 Subject: [PATCH 332/802] [SPARK-10657] Remove SCP-based Jenkins log archiving As of https://issues.apache.org/jira/browse/SPARK-7561, we no longer need to use our custom SCP-based mechanism for archiving Jenkins logs on the master machine; this has been superseded by the use of a Jenkins plugin which archives the logs and provides public links to view them. Per shaneknapp, we should remove this log syncing mechanism if it is no longer necessary; removing the need to SCP from the Jenkins workers to the masters is a desired step as part of some larger Jenkins infra refactoring. Author: Josh Rosen Closes #8793 from JoshRosen/remove-jenkins-ssh-to-master. --- dev/run-tests-jenkins | 35 ----------------------------------- 1 file changed, 35 deletions(-) diff --git a/dev/run-tests-jenkins b/dev/run-tests-jenkins index 3be78575e70f1..d3b05fa6df0ce 100755 --- a/dev/run-tests-jenkins +++ b/dev/run-tests-jenkins @@ -116,39 +116,6 @@ function post_message () { fi } -function send_archived_logs () { - echo "Archiving unit tests logs..." - - local log_files=$( - find .\ - -name "unit-tests.log" -o\ - -path "./sql/hive/target/HiveCompatibilitySuite.failed" -o\ - -path "./sql/hive/target/HiveCompatibilitySuite.hiveFailed" -o\ - -path "./sql/hive/target/HiveCompatibilitySuite.wrong" - ) - - if [ -z "$log_files" ]; then - echo "> No log files found." >&2 - else - local log_archive="unit-tests-logs.tar.gz" - echo "$log_files" | xargs tar czf ${log_archive} - - local jenkins_build_dir=${JENKINS_HOME}/jobs/${JOB_NAME}/builds/${BUILD_NUMBER} - local scp_output=$(scp ${log_archive} amp-jenkins-master:${jenkins_build_dir}/${log_archive}) - local scp_status="$?" - - if [ "$scp_status" -ne 0 ]; then - echo "Failed to send archived unit tests logs to Jenkins master." >&2 - echo "> scp_status: ${scp_status}" >&2 - echo "> scp_output: ${scp_output}" >&2 - else - echo "> Send successful." - fi - - rm -f ${log_archive} - fi -} - # post start message { start_message="\ @@ -244,8 +211,6 @@ done test_result_note=" * This patch **fails $failing_test**." fi - - send_archived_logs } # post end message From 4fbf3328692e876f39ea78494510f9d9c5a53f15 Mon Sep 17 00:00:00 2001 From: Eric Liang Date: Thu, 17 Sep 2015 14:09:06 -0700 Subject: [PATCH 333/802] [SPARK-9698] [ML] Add RInteraction transformer for supporting R-style feature interactions This is a pre-req for supporting the ":" operator in the RFormula feature transformer. 
Design doc from umbrella task: https://docs.google.com/document/d/10NZNSEurN2EdWM31uFYsgayIPfCFHiuIu3pCWrUmP_c/edit mengxr Author: Eric Liang Closes #7987 from ericl/interaction. --- .../apache/spark/ml/feature/Interaction.scala | 278 ++++++++++++++++++ .../spark/ml/feature/InteractionSuite.scala | 165 +++++++++++ 2 files changed, 443 insertions(+) create mode 100644 mllib/src/main/scala/org/apache/spark/ml/feature/Interaction.scala create mode 100644 mllib/src/test/scala/org/apache/spark/ml/feature/InteractionSuite.scala diff --git a/mllib/src/main/scala/org/apache/spark/ml/feature/Interaction.scala b/mllib/src/main/scala/org/apache/spark/ml/feature/Interaction.scala new file mode 100644 index 0000000000000..9194763fb32f5 --- /dev/null +++ b/mllib/src/main/scala/org/apache/spark/ml/feature/Interaction.scala @@ -0,0 +1,278 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.spark.ml.feature + +import scala.collection.mutable.ArrayBuilder + +import org.apache.spark.SparkException +import org.apache.spark.annotation.Experimental +import org.apache.spark.ml.attribute._ +import org.apache.spark.ml.param._ +import org.apache.spark.ml.param.shared._ +import org.apache.spark.ml.util.Identifiable +import org.apache.spark.ml.Transformer +import org.apache.spark.mllib.linalg.{Vector, VectorUDT, Vectors} +import org.apache.spark.sql.{DataFrame, Row} +import org.apache.spark.sql.functions._ +import org.apache.spark.sql.types._ + +/** + * :: Experimental :: + * Implements the feature interaction transform. This transformer takes in Double and Vector type + * columns and outputs a flattened vector of their feature interactions. To handle interaction, + * we first one-hot encode any nominal features. Then, a vector of the feature cross-products is + * produced. + * + * For example, given the input feature values `Double(2)` and `Vector(3, 4)`, the output would be + * `Vector(6, 8)` if all input features were numeric. If the first feature was instead nominal + * with four categories, the output would then be `Vector(0, 0, 0, 0, 3, 4, 0, 0)`. 
+ */ +@Experimental +class Interaction(override val uid: String) extends Transformer + with HasInputCols with HasOutputCol { + + def this() = this(Identifiable.randomUID("interaction")) + + /** @group setParam */ + def setInputCols(values: Array[String]): this.type = set(inputCols, values) + + /** @group setParam */ + def setOutputCol(value: String): this.type = set(outputCol, value) + + // optimistic schema; does not contain any ML attributes + override def transformSchema(schema: StructType): StructType = { + validateParams() + StructType(schema.fields :+ StructField($(outputCol), new VectorUDT, false)) + } + + override def transform(dataset: DataFrame): DataFrame = { + validateParams() + val inputFeatures = $(inputCols).map(c => dataset.schema(c)) + val featureEncoders = getFeatureEncoders(inputFeatures) + val featureAttrs = getFeatureAttrs(inputFeatures) + + def interactFunc = udf { row: Row => + var indices = ArrayBuilder.make[Int] + var values = ArrayBuilder.make[Double] + var size = 1 + indices += 0 + values += 1.0 + var featureIndex = row.length - 1 + while (featureIndex >= 0) { + val prevIndices = indices.result() + val prevValues = values.result() + val prevSize = size + val currentEncoder = featureEncoders(featureIndex) + indices = ArrayBuilder.make[Int] + values = ArrayBuilder.make[Double] + size *= currentEncoder.outputSize + currentEncoder.foreachNonzeroOutput(row(featureIndex), (i, a) => { + var j = 0 + while (j < prevIndices.length) { + indices += prevIndices(j) + i * prevSize + values += prevValues(j) * a + j += 1 + } + }) + featureIndex -= 1 + } + Vectors.sparse(size, indices.result(), values.result()).compressed + } + + val featureCols = inputFeatures.map { f => + f.dataType match { + case DoubleType => dataset(f.name) + case _: VectorUDT => dataset(f.name) + case _: NumericType | BooleanType => dataset(f.name).cast(DoubleType) + } + } + dataset.select( + col("*"), + interactFunc(struct(featureCols: _*)).as($(outputCol), featureAttrs.toMetadata())) + } + + /** + * Creates a feature encoder for each input column, which supports efficient iteration over + * one-hot encoded feature values. See also the class-level comment of [[FeatureEncoder]]. + * + * @param features The input feature columns to create encoders for. + */ + private def getFeatureEncoders(features: Seq[StructField]): Array[FeatureEncoder] = { + def getNumFeatures(attr: Attribute): Int = { + attr match { + case nominal: NominalAttribute => + math.max(1, nominal.getNumValues.getOrElse( + throw new SparkException("Nominal features must have attr numValues defined."))) + case _ => + 1 // numeric feature + } + } + features.map { f => + val numFeatures = f.dataType match { + case _: NumericType | BooleanType => + Array(getNumFeatures(Attribute.fromStructField(f))) + case _: VectorUDT => + val attrs = AttributeGroup.fromStructField(f).attributes.getOrElse( + throw new SparkException("Vector attributes must be defined for interaction.")) + attrs.map(getNumFeatures).toArray + } + new FeatureEncoder(numFeatures) + }.toArray + } + + /** + * Generates ML attributes for the output vector of all feature interactions. We make a best + * effort to generate reasonable names for output features, based on the concatenation of the + * interacting feature names and values delimited with `_`. When no feature name is specified, + * we fall back to using the feature index (e.g. `foo:bar_2_0` may indicate an interaction + * between the numeric `foo` feature and a nominal third feature from column `bar`. 
+ * + * @param features The input feature columns to the Interaction transformer. + */ + private def getFeatureAttrs(features: Seq[StructField]): AttributeGroup = { + var featureAttrs: Seq[Attribute] = Nil + features.reverse.foreach { f => + val encodedAttrs = f.dataType match { + case _: NumericType | BooleanType => + val attr = Attribute.fromStructField(f) + encodedFeatureAttrs(Seq(attr), None) + case _: VectorUDT => + val group = AttributeGroup.fromStructField(f) + encodedFeatureAttrs(group.attributes.get, Some(group.name)) + } + if (featureAttrs.isEmpty) { + featureAttrs = encodedAttrs + } else { + featureAttrs = encodedAttrs.flatMap { head => + featureAttrs.map { tail => + NumericAttribute.defaultAttr.withName(head.name.get + ":" + tail.name.get) + } + } + } + } + new AttributeGroup($(outputCol), featureAttrs.toArray) + } + + /** + * Generates the output ML attributes for a single input feature. Each output feature name has + * up to three parts: the group name, feature name, and category name (for nominal features), + * each separated by an underscore. + * + * @param inputAttrs The attributes of the input feature. + * @param groupName Optional name of the input feature group (for Vector type features). + */ + private def encodedFeatureAttrs( + inputAttrs: Seq[Attribute], + groupName: Option[String]): Seq[Attribute] = { + + def format( + index: Int, + attrName: Option[String], + categoryName: Option[String]): String = { + val parts = Seq(groupName, Some(attrName.getOrElse(index.toString)), categoryName) + parts.flatten.mkString("_") + } + + inputAttrs.zipWithIndex.flatMap { + case (nominal: NominalAttribute, i) => + if (nominal.values.isDefined) { + nominal.values.get.map( + v => BinaryAttribute.defaultAttr.withName(format(i, nominal.name, Some(v)))) + } else { + Array.tabulate(nominal.getNumValues.get)( + j => BinaryAttribute.defaultAttr.withName(format(i, nominal.name, Some(j.toString)))) + } + case (a: Attribute, i) => + Seq(NumericAttribute.defaultAttr.withName(format(i, a.name, None))) + } + } + + override def copy(extra: ParamMap): Interaction = defaultCopy(extra) + + override def validateParams(): Unit = { + require(get(inputCols).isDefined, "Input cols must be defined first.") + require(get(outputCol).isDefined, "Output col must be defined first.") + require($(inputCols).length > 0, "Input cols must have non-zero length.") + require($(inputCols).distinct.length == $(inputCols).length, "Input cols must be distinct.") + } +} + +/** + * This class performs on-the-fly one-hot encoding of features as you iterate over them. To + * indicate which input features should be one-hot encoded, an array of the feature counts + * must be passed in ahead of time. + * + * @param numFeatures Array of feature counts for each input feature. For nominal features this + * count is equal to the number of categories. For numeric features the count + * should be set to 1. + */ +private[ml] class FeatureEncoder(numFeatures: Array[Int]) { + assert(numFeatures.forall(_ > 0), "Features counts must all be positive.") + + /** The size of the output vector. */ + val outputSize = numFeatures.sum + + /** Precomputed offsets for the location of each output feature. */ + private val outputOffsets = { + val arr = new Array[Int](numFeatures.length) + var i = 1 + while (i < arr.length) { + arr(i) = arr(i - 1) + numFeatures(i - 1) + i += 1 + } + arr + } + + /** + * Given an input row of features, invokes the specific function for every non-zero output. 
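+ * For example (an illustrative walk-through consistent with the tests below), with
+ * numFeatures = Array(3, 1) the input Vectors.dense(1.0, 2.2) triggers f(1, 1.0) for the
+ * one-hot position of the first (nominal) feature and f(3, 2.2) for the second (numeric)
+ * feature.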
+ * + * @param value The row value to encode, either a Double or Vector. + * @param f The callback to invoke on each non-zero (index, value) output pair. + */ + def foreachNonzeroOutput(value: Any, f: (Int, Double) => Unit): Unit = value match { + case d: Double => + assert(numFeatures.length == 1, "DoubleType columns should only contain one feature.") + val numOutputCols = numFeatures.head + if (numOutputCols > 1) { + assert( + d >= 0.0 && d == d.toInt && d < numOutputCols, + s"Values from column must be indices, but got $d.") + f(d.toInt, 1.0) + } else { + f(0, d) + } + case vec: Vector => + assert(numFeatures.length == vec.size, + s"Vector column size was ${vec.size}, expected ${numFeatures.length}") + vec.foreachActive { (i, v) => + val numOutputCols = numFeatures(i) + if (numOutputCols > 1) { + assert( + v >= 0.0 && v == v.toInt && v < numOutputCols, + s"Values from column must be indices, but got $v.") + f(outputOffsets(i) + v.toInt, 1.0) + } else { + f(outputOffsets(i), v) + } + } + case null => + throw new SparkException("Values to interact cannot be null.") + case o => + throw new SparkException(s"$o of type ${o.getClass.getName} is not supported.") + } +} diff --git a/mllib/src/test/scala/org/apache/spark/ml/feature/InteractionSuite.scala b/mllib/src/test/scala/org/apache/spark/ml/feature/InteractionSuite.scala new file mode 100644 index 0000000000000..2beb62ca08233 --- /dev/null +++ b/mllib/src/test/scala/org/apache/spark/ml/feature/InteractionSuite.scala @@ -0,0 +1,165 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.spark.ml.feature + +import scala.collection.mutable.ArrayBuilder + +import org.apache.spark.{SparkException, SparkFunSuite} +import org.apache.spark.ml.attribute._ +import org.apache.spark.ml.param.ParamsSuite +import org.apache.spark.mllib.linalg.{Vector, Vectors} +import org.apache.spark.mllib.util.MLlibTestSparkContext +import org.apache.spark.sql.functions.col + +class InteractionSuite extends SparkFunSuite with MLlibTestSparkContext { + test("params") { + ParamsSuite.checkParams(new Interaction()) + } + + test("feature encoder") { + def encode(cardinalities: Array[Int], value: Any): Vector = { + var indices = ArrayBuilder.make[Int] + var values = ArrayBuilder.make[Double] + val encoder = new FeatureEncoder(cardinalities) + encoder.foreachNonzeroOutput(value, (i, v) => { + indices += i + values += v + }) + Vectors.sparse(encoder.outputSize, indices.result(), values.result()).compressed + } + assert(encode(Array(1), 2.2) === Vectors.dense(2.2)) + assert(encode(Array(3), Vectors.dense(1)) === Vectors.dense(0, 1, 0)) + assert(encode(Array(1, 1), Vectors.dense(1.1, 2.2)) === Vectors.dense(1.1, 2.2)) + assert(encode(Array(3, 1), Vectors.dense(1, 2.2)) === Vectors.dense(0, 1, 0, 2.2)) + assert(encode(Array(2, 1), Vectors.dense(1, 2.2)) === Vectors.dense(0, 1, 2.2)) + assert(encode(Array(2, 1, 1), Vectors.dense(0, 2.2, 0)) === Vectors.dense(1, 0, 2.2, 0)) + intercept[SparkException] { encode(Array(1), "foo") } + intercept[SparkException] { encode(Array(1), null) } + intercept[AssertionError] { encode(Array(2), 2.2) } + intercept[AssertionError] { encode(Array(3), Vectors.dense(2.2)) } + intercept[AssertionError] { encode(Array(1), Vectors.dense(1.0, 2.0, 3.0)) } + intercept[AssertionError] { encode(Array(3), Vectors.dense(-1)) } + intercept[AssertionError] { encode(Array(3), Vectors.dense(3)) } + } + + test("numeric interaction") { + val data = sqlContext.createDataFrame( + Seq( + (2, Vectors.dense(3.0, 4.0)), + (1, Vectors.dense(1.0, 5.0))) + ).toDF("a", "b") + val groupAttr = new AttributeGroup( + "b", + Array[Attribute]( + NumericAttribute.defaultAttr.withName("foo"), + NumericAttribute.defaultAttr.withName("bar"))) + val df = data.select( + col("a").as("a", NumericAttribute.defaultAttr.toMetadata()), + col("b").as("b", groupAttr.toMetadata())) + val trans = new Interaction().setInputCols(Array("a", "b")).setOutputCol("features") + val res = trans.transform(df) + val expected = sqlContext.createDataFrame( + Seq( + (2, Vectors.dense(3.0, 4.0), Vectors.dense(6.0, 8.0)), + (1, Vectors.dense(1.0, 5.0), Vectors.dense(1.0, 5.0))) + ).toDF("a", "b", "features") + assert(res.collect() === expected.collect()) + val attrs = AttributeGroup.fromStructField(res.schema("features")) + val expectedAttrs = new AttributeGroup( + "features", + Array[Attribute]( + new NumericAttribute(Some("a:b_foo"), Some(1)), + new NumericAttribute(Some("a:b_bar"), Some(2)))) + assert(attrs === expectedAttrs) + } + + test("nominal interaction") { + val data = sqlContext.createDataFrame( + Seq( + (2, Vectors.dense(3.0, 4.0)), + (1, Vectors.dense(1.0, 5.0))) + ).toDF("a", "b") + val groupAttr = new AttributeGroup( + "b", + Array[Attribute]( + NumericAttribute.defaultAttr.withName("foo"), + NumericAttribute.defaultAttr.withName("bar"))) + val df = data.select( + col("a").as( + "a", NominalAttribute.defaultAttr.withValues(Array("up", "down", "left")).toMetadata()), + col("b").as("b", groupAttr.toMetadata())) + val trans = new Interaction().setInputCols(Array("a", "b")).setOutputCol("features") + val 
res = trans.transform(df) + val expected = sqlContext.createDataFrame( + Seq( + (2, Vectors.dense(3.0, 4.0), Vectors.dense(0, 0, 0, 0, 3, 4)), + (1, Vectors.dense(1.0, 5.0), Vectors.dense(0, 0, 1, 5, 0, 0))) + ).toDF("a", "b", "features") + assert(res.collect() === expected.collect()) + val attrs = AttributeGroup.fromStructField(res.schema("features")) + val expectedAttrs = new AttributeGroup( + "features", + Array[Attribute]( + new NumericAttribute(Some("a_up:b_foo"), Some(1)), + new NumericAttribute(Some("a_up:b_bar"), Some(2)), + new NumericAttribute(Some("a_down:b_foo"), Some(3)), + new NumericAttribute(Some("a_down:b_bar"), Some(4)), + new NumericAttribute(Some("a_left:b_foo"), Some(5)), + new NumericAttribute(Some("a_left:b_bar"), Some(6)))) + assert(attrs === expectedAttrs) + } + + test("default attr names") { + val data = sqlContext.createDataFrame( + Seq( + (2, Vectors.dense(0.0, 4.0), 1.0), + (1, Vectors.dense(1.0, 5.0), 10.0)) + ).toDF("a", "b", "c") + val groupAttr = new AttributeGroup( + "b", + Array[Attribute]( + NominalAttribute.defaultAttr.withNumValues(2), + NumericAttribute.defaultAttr)) + val df = data.select( + col("a").as("a", NominalAttribute.defaultAttr.withNumValues(3).toMetadata()), + col("b").as("b", groupAttr.toMetadata()), + col("c").as("c", NumericAttribute.defaultAttr.toMetadata())) + val trans = new Interaction().setInputCols(Array("a", "b", "c")).setOutputCol("features") + val res = trans.transform(df) + val expected = sqlContext.createDataFrame( + Seq( + (2, Vectors.dense(0.0, 4.0), 1.0, Vectors.dense(0, 0, 0, 0, 0, 0, 1, 0, 4)), + (1, Vectors.dense(1.0, 5.0), 10.0, Vectors.dense(0, 0, 0, 0, 10, 50, 0, 0, 0))) + ).toDF("a", "b", "c", "features") + assert(res.collect() === expected.collect()) + val attrs = AttributeGroup.fromStructField(res.schema("features")) + val expectedAttrs = new AttributeGroup( + "features", + Array[Attribute]( + new NumericAttribute(Some("a_0:b_0_0:c"), Some(1)), + new NumericAttribute(Some("a_0:b_0_1:c"), Some(2)), + new NumericAttribute(Some("a_0:b_1:c"), Some(3)), + new NumericAttribute(Some("a_1:b_0_0:c"), Some(4)), + new NumericAttribute(Some("a_1:b_0_1:c"), Some(5)), + new NumericAttribute(Some("a_1:b_1:c"), Some(6)), + new NumericAttribute(Some("a_2:b_0_0:c"), Some(7)), + new NumericAttribute(Some("a_2:b_0_1:c"), Some(8)), + new NumericAttribute(Some("a_2:b_1:c"), Some(9)))) + assert(attrs === expectedAttrs) + } +} From 0f5ef6dfa67a068606aff8ea9d1addfce73446eb Mon Sep 17 00:00:00 2001 From: Marcelo Vanzin Date: Thu, 17 Sep 2015 19:16:34 -0700 Subject: [PATCH 334/802] [SPARK-10674] [TESTS] Increase timeouts in SaslIntegrationSuite. 1s seems to trigger too many times on the jenkins build boxes, so increase the timeout and cross fingers. Author: Marcelo Vanzin Closes #8802 from vanzin/SPARK-10674 and squashes the following commits: 3c93117 [Marcelo Vanzin] Use java 7 syntax. d667d1b [Marcelo Vanzin] [SPARK-10674] [tests] Increase timeouts in SaslIntegrationSuite. 
--- .../spark/network/sasl/SaslIntegrationSuite.java | 15 ++++++++++----- 1 file changed, 10 insertions(+), 5 deletions(-) diff --git a/network/shuffle/src/test/java/org/apache/spark/network/sasl/SaslIntegrationSuite.java b/network/shuffle/src/test/java/org/apache/spark/network/sasl/SaslIntegrationSuite.java index 5cb0e4d4a6458..c393a5e1e6810 100644 --- a/network/shuffle/src/test/java/org/apache/spark/network/sasl/SaslIntegrationSuite.java +++ b/network/shuffle/src/test/java/org/apache/spark/network/sasl/SaslIntegrationSuite.java @@ -56,6 +56,11 @@ import org.apache.spark.network.util.TransportConf; public class SaslIntegrationSuite { + + // Use a long timeout to account for slow / overloaded build machines. In the normal case, + // tests should finish way before the timeout expires. + private final static long TIMEOUT_MS = 10_000; + static TransportServer server; static TransportConf conf; static TransportContext context; @@ -102,7 +107,7 @@ public void testGoodClient() throws IOException { TransportClient client = clientFactory.createClient(TestUtils.getLocalHost(), server.getPort()); String msg = "Hello, World!"; - byte[] resp = client.sendRpcSync(msg.getBytes(), 1000); + byte[] resp = client.sendRpcSync(msg.getBytes(), TIMEOUT_MS); assertEquals(msg, new String(resp)); // our rpc handler should just return the given msg } @@ -131,7 +136,7 @@ public void testNoSaslClient() throws IOException { TransportClient client = clientFactory.createClient(TestUtils.getLocalHost(), server.getPort()); try { - client.sendRpcSync(new byte[13], 1000); + client.sendRpcSync(new byte[13], TIMEOUT_MS); fail("Should have failed"); } catch (Exception e) { assertTrue(e.getMessage(), e.getMessage().contains("Expected SaslMessage")); @@ -139,7 +144,7 @@ public void testNoSaslClient() throws IOException { try { // Guessing the right tag byte doesn't magically get you in... - client.sendRpcSync(new byte[] { (byte) 0xEA }, 1000); + client.sendRpcSync(new byte[] { (byte) 0xEA }, TIMEOUT_MS); fail("Should have failed"); } catch (Exception e) { assertTrue(e.getMessage(), e.getMessage().contains("java.lang.IndexOutOfBoundsException")); @@ -217,12 +222,12 @@ public synchronized void onBlockFetchFailure(String blockId, Throwable t) { new String[] { System.getProperty("java.io.tmpdir") }, 1, "org.apache.spark.shuffle.sort.SortShuffleManager"); RegisterExecutor regmsg = new RegisterExecutor("app-1", "0", executorInfo); - client1.sendRpcSync(regmsg.toByteArray(), 10000); + client1.sendRpcSync(regmsg.toByteArray(), TIMEOUT_MS); // Make a successful request to fetch blocks, which creates a new stream. But do not actually // fetch any blocks, to keep the stream open. OpenBlocks openMessage = new OpenBlocks("app-1", "0", blockIds); - byte[] response = client1.sendRpcSync(openMessage.toByteArray(), 10000); + byte[] response = client1.sendRpcSync(openMessage.toByteArray(), TIMEOUT_MS); StreamHandle stream = (StreamHandle) BlockTransferMessage.Decoder.fromByteArray(response); long streamId = stream.streamId; From 98f1ea67da1b0e3aa791c3cbfa06e48e2ba0d75b Mon Sep 17 00:00:00 2001 From: Yanbo Liang Date: Thu, 17 Sep 2015 21:37:10 -0700 Subject: [PATCH 335/802] [SPARK-8518] [ML] Log-linear models for survival analysis [Accelerated Failure Time (AFT) model](https://en.wikipedia.org/wiki/Accelerated_failure_time_model) is the most commonly used and easy to parallel method of survival analysis for censored survival data. It is the log-linear model based on the Weibull distribution of the survival time. 
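In sketch form, using the notation of the loss derivation added in this patch, the model assumes \log{t_{i}} = x^{'}\beta + \sigma\epsilon_{i}, where \epsilon_{i} follows the extreme value distribution so that the lifetime t_{i} is Weibull distributed.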
Users can refer to the R function [```survreg```](https://stat.ethz.ch/R-manual/R-devel/library/survival/html/survreg.html) to compare the model and [```predict```](https://stat.ethz.ch/R-manual/R-devel/library/survival/html/predict.survreg.html) to compare the prediction. There are different kinds of model prediction, I have just select the type ```response``` which is default used for R. Author: Yanbo Liang Closes #8611 from yanboliang/spark-8518. --- .../ml/regression/AFTSurvivalRegression.scala | 449 ++++++++++++++++++ .../AFTSurvivalRegressionSuite.scala | 311 ++++++++++++ 2 files changed, 760 insertions(+) create mode 100644 mllib/src/main/scala/org/apache/spark/ml/regression/AFTSurvivalRegression.scala create mode 100644 mllib/src/test/scala/org/apache/spark/ml/regression/AFTSurvivalRegressionSuite.scala diff --git a/mllib/src/main/scala/org/apache/spark/ml/regression/AFTSurvivalRegression.scala b/mllib/src/main/scala/org/apache/spark/ml/regression/AFTSurvivalRegression.scala new file mode 100644 index 0000000000000..5b25db651f56c --- /dev/null +++ b/mllib/src/main/scala/org/apache/spark/ml/regression/AFTSurvivalRegression.scala @@ -0,0 +1,449 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.spark.ml.regression + +import scala.collection.mutable + +import breeze.linalg.{DenseVector => BDV} +import breeze.optimize.{CachedDiffFunction, DiffFunction, LBFGS => BreezeLBFGS} + +import org.apache.spark.{SparkException, Logging} +import org.apache.spark.annotation.{Since, Experimental} +import org.apache.spark.ml.{Model, Estimator} +import org.apache.spark.ml.param._ +import org.apache.spark.ml.param.shared._ +import org.apache.spark.ml.util.{SchemaUtils, Identifiable} +import org.apache.spark.mllib.linalg.{Vector, Vectors, VectorUDT} +import org.apache.spark.mllib.linalg.BLAS +import org.apache.spark.rdd.RDD +import org.apache.spark.sql.{Row, DataFrame} +import org.apache.spark.sql.functions._ +import org.apache.spark.sql.types.{DoubleType, StructType} +import org.apache.spark.storage.StorageLevel + +/** + * Params for accelerated failure time (AFT) regression. + */ +private[regression] trait AFTSurvivalRegressionParams extends Params + with HasFeaturesCol with HasLabelCol with HasPredictionCol with HasMaxIter + with HasTol with HasFitIntercept { + + /** + * Param for censor column name. + * The value of this column could be 0 or 1. + * If the value is 1, it means the event has occurred i.e. uncensored; otherwise censored. 
+ * @group param + */ + @Since("1.6.0") + final val censorCol: Param[String] = new Param(this, "censorCol", "censor column name") + + /** @group getParam */ + @Since("1.6.0") + def getCensorCol: String = $(censorCol) + setDefault(censorCol -> "censor") + + /** + * Param for quantile probabilities array. + * Values of the quantile probabilities array should be in the range [0, 1]. + * @group param + */ + @Since("1.6.0") + final val quantileProbabilities: DoubleArrayParam = new DoubleArrayParam(this, + "quantileProbabilities", "quantile probabilities array", + (t: Array[Double]) => t.forall(ParamValidators.inRange(0, 1))) + + /** @group getParam */ + @Since("1.6.0") + def getQuantileProbabilities: Array[Double] = $(quantileProbabilities) + + /** Checks whether the input has quantile probabilities array. */ + protected[regression] def hasQuantileProbabilities: Boolean = { + isDefined(quantileProbabilities) && $(quantileProbabilities).size != 0 + } + + /** + * Validates and transforms the input schema with the provided param map. + * @param schema input schema + * @param fitting whether this is in fitting or prediction + * @return output schema + */ + protected def validateAndTransformSchema( + schema: StructType, + fitting: Boolean): StructType = { + SchemaUtils.checkColumnType(schema, $(featuresCol), new VectorUDT) + if (fitting) { + SchemaUtils.checkColumnType(schema, $(censorCol), DoubleType) + SchemaUtils.checkColumnType(schema, $(labelCol), DoubleType) + } + SchemaUtils.appendColumn(schema, $(predictionCol), DoubleType) + } +} + +/** + * :: Experimental :: + * Fit a parametric survival regression model named accelerated failure time (AFT) model + * ([[https://en.wikipedia.org/wiki/Accelerated_failure_time_model]]) + * based on the Weibull distribution of the survival time. + */ +@Experimental +@Since("1.6.0") +class AFTSurvivalRegression @Since("1.6.0") (@Since("1.6.0") override val uid: String) + extends Estimator[AFTSurvivalRegressionModel] with AFTSurvivalRegressionParams with Logging { + + @Since("1.6.0") + def this() = this(Identifiable.randomUID("aftSurvReg")) + + /** @group setParam */ + @Since("1.6.0") + def setFeaturesCol(value: String): this.type = set(featuresCol, value) + + /** @group setParam */ + @Since("1.6.0") + def setLabelCol(value: String): this.type = set(labelCol, value) + + /** @group setParam */ + @Since("1.6.0") + def setCensorCol(value: String): this.type = set(censorCol, value) + + /** @group setParam */ + @Since("1.6.0") + def setPredictionCol(value: String): this.type = set(predictionCol, value) + + /** + * Set if we should fit the intercept + * Default is true. + * @group setParam + */ + @Since("1.6.0") + def setFitIntercept(value: Boolean): this.type = set(fitIntercept, value) + setDefault(fitIntercept -> true) + + /** + * Set the maximum number of iterations. + * Default is 100. + * @group setParam + */ + @Since("1.6.0") + def setMaxIter(value: Int): this.type = set(maxIter, value) + setDefault(maxIter -> 100) + + /** + * Set the convergence tolerance of iterations. + * Smaller value will lead to higher accuracy with the cost of more iterations. + * Default is 1E-6. + * @group setParam + */ + @Since("1.6.0") + def setTol(value: Double): this.type = set(tol, value) + setDefault(tol -> 1E-6) + + /** + * Extract [[featuresCol]], [[labelCol]] and [[censorCol]] from input dataset, + * and put it in an RDD with strong types. 
+ */ + protected[ml] def extractAFTPoints(dataset: DataFrame): RDD[AFTPoint] = { + dataset.select($(featuresCol), $(labelCol), $(censorCol)).map { + case Row(features: Vector, label: Double, censor: Double) => + AFTPoint(features, label, censor) + } + } + + @Since("1.6.0") + override def fit(dataset: DataFrame): AFTSurvivalRegressionModel = { + validateAndTransformSchema(dataset.schema, fitting = true) + val instances = extractAFTPoints(dataset) + val handlePersistence = dataset.rdd.getStorageLevel == StorageLevel.NONE + if (handlePersistence) instances.persist(StorageLevel.MEMORY_AND_DISK) + + val costFun = new AFTCostFun(instances, $(fitIntercept)) + val optimizer = new BreezeLBFGS[BDV[Double]]($(maxIter), 10, $(tol)) + + val numFeatures = dataset.select($(featuresCol)).take(1)(0).getAs[Vector](0).size + /* + The weights vector has three parts: + the first element: Double, log(sigma), the log of scale parameter + the second element: Double, intercept of the beta parameter + the third to the end elements: Doubles, regression coefficients vector of the beta parameter + */ + val initialWeights = Vectors.zeros(numFeatures + 2) + + val states = optimizer.iterations(new CachedDiffFunction(costFun), + initialWeights.toBreeze.toDenseVector) + + val weights = { + val arrayBuilder = mutable.ArrayBuilder.make[Double] + var state: optimizer.State = null + while (states.hasNext) { + state = states.next() + arrayBuilder += state.adjustedValue + } + if (state == null) { + val msg = s"${optimizer.getClass.getName} failed." + throw new SparkException(msg) + } + + state.x.toArray.clone() + } + + if (handlePersistence) instances.unpersist() + + val coefficients = Vectors.dense(weights.slice(2, weights.length)) + val intercept = weights(1) + val scale = math.exp(weights(0)) + val model = new AFTSurvivalRegressionModel(uid, coefficients, intercept, scale) + copyValues(model.setParent(this)) + } + + @Since("1.6.0") + override def transformSchema(schema: StructType): StructType = { + validateAndTransformSchema(schema, fitting = true) + } + + @Since("1.6.0") + override def copy(extra: ParamMap): AFTSurvivalRegression = defaultCopy(extra) +} + +/** + * :: Experimental :: + * Model produced by [[AFTSurvivalRegression]]. 
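+ *
+ * A minimal usage sketch (`training` is an assumed DataFrame with "features", "label" and
+ * "censor" columns; `features` is an assumed feature [[Vector]]):
+ * {{{
+ *   val model = new AFTSurvivalRegression().setCensorCol("censor").fit(training)
+ *   val predictions = model.transform(training)
+ *   model.setQuantileProbabilities(Array(0.1, 0.5, 0.9))
+ *   val quantiles = model.predictQuantiles(features)
+ * }}}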
+ */ +@Experimental +@Since("1.6.0") +class AFTSurvivalRegressionModel private[ml] ( + @Since("1.6.0") override val uid: String, + @Since("1.6.0") val coefficients: Vector, + @Since("1.6.0") val intercept: Double, + @Since("1.6.0") val scale: Double) + extends Model[AFTSurvivalRegressionModel] with AFTSurvivalRegressionParams { + + /** @group setParam */ + @Since("1.6.0") + def setFeaturesCol(value: String): this.type = set(featuresCol, value) + + /** @group setParam */ + @Since("1.6.0") + def setPredictionCol(value: String): this.type = set(predictionCol, value) + + /** @group setParam */ + @Since("1.6.0") + def setQuantileProbabilities(value: Array[Double]): this.type = set(quantileProbabilities, value) + + @Since("1.6.0") + def predictQuantiles(features: Vector): Vector = { + require(hasQuantileProbabilities, + "AFTSurvivalRegressionModel predictQuantiles must set quantile probabilities array") + // scale parameter for the Weibull distribution of lifetime + val lambda = math.exp(BLAS.dot(coefficients, features) + intercept) + // shape parameter for the Weibull distribution of lifetime + val k = 1 / scale + val quantiles = $(quantileProbabilities).map { + q => lambda * math.exp(math.log(-math.log(1 - q)) / k) + } + Vectors.dense(quantiles) + } + + @Since("1.6.0") + def predict(features: Vector): Double = { + math.exp(BLAS.dot(coefficients, features) + intercept) + } + + @Since("1.6.0") + override def transform(dataset: DataFrame): DataFrame = { + transformSchema(dataset.schema) + val predictUDF = udf { features: Vector => predict(features) } + dataset.withColumn($(predictionCol), predictUDF(col($(featuresCol)))) + } + + @Since("1.6.0") + override def transformSchema(schema: StructType): StructType = { + validateAndTransformSchema(schema, fitting = false) + } + + @Since("1.6.0") + override def copy(extra: ParamMap): AFTSurvivalRegressionModel = { + copyValues(new AFTSurvivalRegressionModel(uid, coefficients, intercept, scale), extra) + .setParent(parent) + } +} + +/** + * AFTAggregator computes the gradient and loss for a AFT loss function, + * as used in AFT survival regression for samples in sparse or dense vector in a online fashion. + * + * The loss function and likelihood function under the AFT model based on: + * Lawless, J. F., Statistical Models and Methods for Lifetime Data, + * New York: John Wiley & Sons, Inc. 2003. + * + * Two AFTAggregator can be merged together to have a summary of loss and gradient of + * the corresponding joint dataset. + * + * Given the values of the covariates x^{'}, for random lifetime t_{i} of subjects i = 1, ..., n, + * with possible right-censoring, the likelihood function under the AFT model is given as + * {{{ + * L(\beta,\sigma)=\prod_{i=1}^n[\frac{1}{\sigma}f_{0} + * (\frac{\log{t_{i}}-x^{'}\beta}{\sigma})]^{\delta_{i}}S_{0} + * (\frac{\log{t_{i}}-x^{'}\beta}{\sigma})^{1-\delta_{i}} + * }}} + * Where \delta_{i} is the indicator of the event has occurred i.e. uncensored or not. + * Using \epsilon_{i}=\frac{\log{t_{i}}-x^{'}\beta}{\sigma}, the log-likelihood function + * assumes the form + * {{{ + * \iota(\beta,\sigma)=\sum_{i=1}^{n}[-\delta_{i}\log\sigma+ + * \delta_{i}\log{f_{0}}(\epsilon_{i})+(1-\delta_{i})\log{S_{0}(\epsilon_{i})}] + * }}} + * Where S_{0}(\epsilon_{i}) is the baseline survivor function, + * and f_{0}(\epsilon_{i}) is corresponding density function. + * + * The most commonly used log-linear survival regression method is based on the Weibull + * distribution of the survival time. 
The Weibull distribution for lifetime corresponding + * to extreme value distribution for log of the lifetime, + * and the S_{0}(\epsilon) function is + * {{{ + * S_{0}(\epsilon_{i})=\exp(-e^{\epsilon_{i}}) + * }}} + * the f_{0}(\epsilon_{i}) function is + * {{{ + * f_{0}(\epsilon_{i})=e^{\epsilon_{i}}\exp(-e^{\epsilon_{i}}) + * }}} + * The log-likelihood function for Weibull distribution of lifetime is + * {{{ + * \iota(\beta,\sigma)= + * -\sum_{i=1}^n[\delta_{i}\log\sigma-\delta_{i}\epsilon_{i}+e^{\epsilon_{i}}] + * }}} + * Due to minimizing the negative log-likelihood equivalent to maximum a posteriori probability, + * the loss function we use to optimize is -\iota(\beta,\sigma). + * The gradient functions for \beta and \log\sigma respectively are + * {{{ + * \frac{\partial (-\iota)}{\partial \beta}= + * \sum_{1=1}^{n}[\delta_{i}-e^{\epsilon_{i}}]\frac{x_{i}}{\sigma} + * }}} + * {{{ + * \frac{\partial (-\iota)}{\partial (\log\sigma)}= + * \sum_{i=1}^{n}[\delta_{i}+(\delta_{i}-e^{\epsilon_{i}})\epsilon_{i}] + * }}} + * @param weights The log of scale parameter, the intercept and + * regression coefficients corresponding to the features. + * @param fitIntercept Whether to fit an intercept term. + */ +private class AFTAggregator(weights: BDV[Double], fitIntercept: Boolean) + extends Serializable { + + // beta is the intercept and regression coefficients to the covariates + private val beta = weights.slice(1, weights.length) + // sigma is the scale parameter of the AFT model + private val sigma = math.exp(weights(0)) + + private var totalCnt: Long = 0L + private var lossSum = 0.0 + private var gradientBetaSum = BDV.zeros[Double](beta.length) + private var gradientLogSigmaSum = 0.0 + + def count: Long = totalCnt + + def loss: Double = if (totalCnt == 0) 1.0 else lossSum / totalCnt + + // Here we optimize loss function over beta and log(sigma) + def gradient: BDV[Double] = BDV.vertcat(BDV(Array(gradientLogSigmaSum / totalCnt.toDouble)), + gradientBetaSum/totalCnt.toDouble) + + /** + * Add a new training data to this AFTAggregator, and update the loss and gradient + * of the objective function. + * + * @param data The AFTPoint representation for one data point to be added into this aggregator. + * @return This AFTAggregator object. + */ + def add(data: AFTPoint): this.type = { + + // TODO: Don't create a new xi vector each time. + val xi = if (fitIntercept) { + Vectors.dense(Array(1.0) ++ data.features.toArray).toBreeze + } else { + Vectors.dense(Array(0.0) ++ data.features.toArray).toBreeze + } + val ti = data.label + val delta = data.censor + val epsilon = (math.log(ti) - beta.dot(xi)) / sigma + + lossSum += math.log(sigma) * delta + lossSum += (math.exp(epsilon) - delta * epsilon) + + // Sanity check (should never occur): + assert(!lossSum.isInfinity, + s"AFTAggregator loss sum is infinity. Error for unknown reason.") + + gradientBetaSum += xi * (delta - math.exp(epsilon)) / sigma + gradientLogSigmaSum += delta + (delta - math.exp(epsilon)) * epsilon + + totalCnt += 1 + this + } + + /** + * Merge another AFTAggregator, and update the loss and gradient + * of the objective function. + * (Note that it's in place merging; as a result, `this` object will be modified.) + * + * @param other The other AFTAggregator to be merged. + * @return This AFTAggregator object. 
+ */ + def merge(other: AFTAggregator): this.type = { + if (totalCnt != 0) { + totalCnt += other.totalCnt + lossSum += other.lossSum + + gradientBetaSum += other.gradientBetaSum + gradientLogSigmaSum += other.gradientLogSigmaSum + } + this + } +} + +/** + * AFTCostFun implements Breeze's DiffFunction[T] for AFT cost. + * It returns the loss and gradient at a particular point (coefficients). + * It's used in Breeze's convex optimization routines. + */ +private class AFTCostFun(data: RDD[AFTPoint], fitIntercept: Boolean) + extends DiffFunction[BDV[Double]] { + + override def calculate(coefficients: BDV[Double]): (Double, BDV[Double]) = { + + val aftAggregator = data.treeAggregate(new AFTAggregator(coefficients, fitIntercept))( + seqOp = (c, v) => (c, v) match { + case (aggregator, instance) => aggregator.add(instance) + }, + combOp = (c1, c2) => (c1, c2) match { + case (aggregator1, aggregator2) => aggregator1.merge(aggregator2) + }) + + (aftAggregator.loss, aftAggregator.gradient) + } +} + +/** + * Class that represents the (features, label, censor) of a data point. + * + * @param features List of features for this data point. + * @param label Label for this data point. + * @param censor Indicator of the event has occurred or not. If the value is 1, it means + * the event has occurred i.e. uncensored; otherwise censored. + */ +private[regression] case class AFTPoint(features: Vector, label: Double, censor: Double) { + require(censor == 1.0 || censor == 0.0, "censor of class AFTPoint must be 1.0 or 0.0") +} diff --git a/mllib/src/test/scala/org/apache/spark/ml/regression/AFTSurvivalRegressionSuite.scala b/mllib/src/test/scala/org/apache/spark/ml/regression/AFTSurvivalRegressionSuite.scala new file mode 100644 index 0000000000000..ca7140a45ea65 --- /dev/null +++ b/mllib/src/test/scala/org/apache/spark/ml/regression/AFTSurvivalRegressionSuite.scala @@ -0,0 +1,311 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.spark.ml.regression + +import scala.util.Random + +import org.apache.spark.SparkFunSuite +import org.apache.spark.ml.param.ParamsSuite +import org.apache.spark.ml.util.MLTestingUtils +import org.apache.spark.mllib.linalg.{DenseVector, Vectors} +import org.apache.spark.mllib.linalg.BLAS +import org.apache.spark.mllib.random.{ExponentialGenerator, WeibullGenerator} +import org.apache.spark.mllib.util.TestingUtils._ +import org.apache.spark.mllib.util.MLlibTestSparkContext +import org.apache.spark.sql.{Row, DataFrame} + +class AFTSurvivalRegressionSuite extends SparkFunSuite with MLlibTestSparkContext { + + @transient var datasetUnivariate: DataFrame = _ + @transient var datasetMultivariate: DataFrame = _ + + override def beforeAll(): Unit = { + super.beforeAll() + datasetUnivariate = sqlContext.createDataFrame( + sc.parallelize(generateAFTInput( + 1, Array(5.5), Array(0.8), 1000, 42, 1.0, 2.0, 2.0))) + datasetMultivariate = sqlContext.createDataFrame( + sc.parallelize(generateAFTInput( + 2, Array(0.9, -1.3), Array(0.7, 1.2), 1000, 42, 1.5, 2.5, 2.0))) + } + + test("params") { + ParamsSuite.checkParams(new AFTSurvivalRegression) + val model = new AFTSurvivalRegressionModel("aftSurvReg", Vectors.dense(0.0), 0.0, 0.0) + ParamsSuite.checkParams(model) + } + + test("aft survival regression: default params") { + val aftr = new AFTSurvivalRegression + assert(aftr.getLabelCol === "label") + assert(aftr.getFeaturesCol === "features") + assert(aftr.getPredictionCol === "prediction") + assert(aftr.getCensorCol === "censor") + assert(aftr.getFitIntercept) + assert(aftr.getMaxIter === 100) + assert(aftr.getTol === 1E-6) + val model = aftr.fit(datasetUnivariate) + + // copied model must have the same parent. + MLTestingUtils.checkCopy(model) + + model.transform(datasetUnivariate) + .select("label", "prediction") + .collect() + assert(model.getFeaturesCol === "features") + assert(model.getPredictionCol === "prediction") + assert(model.intercept !== 0.0) + assert(model.hasParent) + } + + def generateAFTInput( + numFeatures: Int, + xMean: Array[Double], + xVariance: Array[Double], + nPoints: Int, + seed: Int, + weibullShape: Double, + weibullScale: Double, + exponentialMean: Double): Seq[AFTPoint] = { + + def censor(x: Double, y: Double): Double = { if (x <= y) 1.0 else 0.0 } + + val weibull = new WeibullGenerator(weibullShape, weibullScale) + weibull.setSeed(seed) + + val exponential = new ExponentialGenerator(exponentialMean) + exponential.setSeed(seed) + + val rnd = new Random(seed) + val x = Array.fill[Array[Double]](nPoints)(Array.fill[Double](numFeatures)(rnd.nextDouble())) + + x.foreach { v => + var i = 0 + val len = v.length + while (i < len) { + v(i) = (v(i) - 0.5) * math.sqrt(12.0 * xVariance(i)) + xMean(i) + i += 1 + } + } + val y = (1 to nPoints).map { i => (weibull.nextValue(), exponential.nextValue()) } + + y.zip(x).map { p => AFTPoint(Vectors.dense(p._2), p._1._1, censor(p._1._1, p._1._2)) } + } + + test("aft survival regression with univariate") { + val trainer = new AFTSurvivalRegression + val model = trainer.fit(datasetUnivariate) + + /* + Using the following R code to load the data and train the model using survival package. + + library("survival") + data <- read.csv("path", header=FALSE, stringsAsFactors=FALSE) + features <- data$V1 + censor <- data$V2 + label <- data$V3 + sr.fit <- survreg(Surv(label, censor) ~ features, dist='weibull') + summary(sr.fit) + + Value Std. 
Error z p + (Intercept) 1.759 0.4141 4.247 2.16e-05 + features -0.039 0.0735 -0.531 5.96e-01 + Log(scale) 0.344 0.0379 9.073 1.16e-19 + + Scale= 1.41 + + Weibull distribution + Loglik(model)= -1152.2 Loglik(intercept only)= -1152.3 + Chisq= 0.28 on 1 degrees of freedom, p= 0.6 + Number of Newton-Raphson Iterations: 5 + n= 1000 + */ + val coefficientsR = Vectors.dense(-0.039) + val interceptR = 1.759 + val scaleR = 1.41 + + assert(model.intercept ~== interceptR relTol 1E-3) + assert(model.coefficients ~== coefficientsR relTol 1E-3) + assert(model.scale ~== scaleR relTol 1E-3) + + /* + Using the following R code to predict. + + testdata <- list(features=6.559282795753792) + responsePredict <- predict(sr.fit, newdata=testdata) + responsePredict + + 1 + 4.494763 + + quantilePredict <- predict(sr.fit, newdata=testdata, type='quantile', p=c(0.1, 0.5, 0.9)) + quantilePredict + + [1] 0.1879174 2.6801195 14.5779394 + */ + val features = Vectors.dense(6.559282795753792) + val quantileProbabilities = Array(0.1, 0.5, 0.9) + val responsePredictR = 4.494763 + val quantilePredictR = Vectors.dense(0.1879174, 2.6801195, 14.5779394) + + assert(model.predict(features) ~== responsePredictR relTol 1E-3) + model.setQuantileProbabilities(quantileProbabilities) + assert(model.predictQuantiles(features) ~== quantilePredictR relTol 1E-3) + + model.transform(datasetUnivariate).select("features", "prediction").collect().foreach { + case Row(features: DenseVector, prediction1: Double) => + val prediction2 = math.exp(BLAS.dot(model.coefficients, features) + model.intercept) + assert(prediction1 ~== prediction2 relTol 1E-5) + } + } + + test("aft survival regression with multivariate") { + val trainer = new AFTSurvivalRegression + val model = trainer.fit(datasetMultivariate) + + /* + Using the following R code to load the data and train the model using survival package. + + library("survival") + data <- read.csv("path", header=FALSE, stringsAsFactors=FALSE) + feature1 <- data$V1 + feature2 <- data$V2 + censor <- data$V3 + label <- data$V4 + sr.fit <- survreg(Surv(label, censor) ~ feature1 + feature2, dist='weibull') + summary(sr.fit) + + Value Std. Error z p + (Intercept) 1.9206 0.1057 18.171 8.78e-74 + feature1 -0.0844 0.0611 -1.381 1.67e-01 + feature2 0.0677 0.0468 1.447 1.48e-01 + Log(scale) -0.0236 0.0436 -0.542 5.88e-01 + + Scale= 0.977 + + Weibull distribution + Loglik(model)= -1070.7 Loglik(intercept only)= -1072.7 + Chisq= 3.91 on 2 degrees of freedom, p= 0.14 + Number of Newton-Raphson Iterations: 5 + n= 1000 + */ + val coefficientsR = Vectors.dense(-0.0844, 0.0677) + val interceptR = 1.9206 + val scaleR = 0.977 + + assert(model.intercept ~== interceptR relTol 1E-3) + assert(model.coefficients ~== coefficientsR relTol 1E-3) + assert(model.scale ~== scaleR relTol 1E-3) + + /* + Using the following R code to predict. 
+ testdata <- list(feature1=2.233396950271428, feature2=-2.5321374085997683) + responsePredict <- predict(sr.fit, newdata=testdata) + responsePredict + + 1 + 4.761219 + + quantilePredict <- predict(sr.fit, newdata=testdata, type='quantile', p=c(0.1, 0.5, 0.9)) + quantilePredict + + [1] 0.5287044 3.3285858 10.7517072 + */ + val features = Vectors.dense(2.233396950271428, -2.5321374085997683) + val quantileProbabilities = Array(0.1, 0.5, 0.9) + val responsePredictR = 4.761219 + val quantilePredictR = Vectors.dense(0.5287044, 3.3285858, 10.7517072) + + assert(model.predict(features) ~== responsePredictR relTol 1E-3) + model.setQuantileProbabilities(quantileProbabilities) + assert(model.predictQuantiles(features) ~== quantilePredictR relTol 1E-3) + + model.transform(datasetMultivariate).select("features", "prediction").collect().foreach { + case Row(features: DenseVector, prediction1: Double) => + val prediction2 = math.exp(BLAS.dot(model.coefficients, features) + model.intercept) + assert(prediction1 ~== prediction2 relTol 1E-5) + } + } + + test("aft survival regression w/o intercept") { + val trainer = new AFTSurvivalRegression().setFitIntercept(false) + val model = trainer.fit(datasetMultivariate) + + /* + Using the following R code to load the data and train the model using survival package. + + library("survival") + data <- read.csv("path", header=FALSE, stringsAsFactors=FALSE) + feature1 <- data$V1 + feature2 <- data$V2 + censor <- data$V3 + label <- data$V4 + sr.fit <- survreg(Surv(label, censor) ~ feature1 + feature2 - 1, dist='weibull') + summary(sr.fit) + + Value Std. Error z p + feature1 0.896 0.0685 13.1 3.93e-39 + feature2 -0.709 0.0522 -13.6 5.78e-42 + Log(scale) 0.420 0.0401 10.5 1.23e-25 + + Scale= 1.52 + + Weibull distribution + Loglik(model)= -1292.4 Loglik(intercept only)= -1072.7 + Chisq= -439.57 on 1 degrees of freedom, p= 1 + Number of Newton-Raphson Iterations: 6 + n= 1000 + */ + val coefficientsR = Vectors.dense(0.896, -0.709) + val interceptR = 0.0 + val scaleR = 1.52 + + assert(model.intercept === interceptR) + assert(model.coefficients ~== coefficientsR relTol 1E-3) + assert(model.scale ~== scaleR relTol 1E-3) + + /* + Using the following R code to predict. + testdata <- list(feature1=2.233396950271428, feature2=-2.5321374085997683) + responsePredict <- predict(sr.fit, newdata=testdata) + responsePredict + + 1 + 44.54465 + + quantilePredict <- predict(sr.fit, newdata=testdata, type='quantile', p=c(0.1, 0.5, 0.9)) + quantilePredict + + [1] 1.452103 25.506077 158.428600 + */ + val features = Vectors.dense(2.233396950271428, -2.5321374085997683) + val quantileProbabilities = Array(0.1, 0.5, 0.9) + val responsePredictR = 44.54465 + val quantilePredictR = Vectors.dense(1.452103, 25.506077, 158.428600) + + assert(model.predict(features) ~== responsePredictR relTol 1E-3) + model.setQuantileProbabilities(quantileProbabilities) + assert(model.predictQuantiles(features) ~== quantilePredictR relTol 1E-3) + + model.transform(datasetMultivariate).select("features", "prediction").collect().foreach { + case Row(features: DenseVector, prediction1: Double) => + val prediction2 = math.exp(BLAS.dot(model.coefficients, features) + model.intercept) + assert(prediction1 ~== prediction2 relTol 1E-5) + } + } +} From d009da2f5c803f3b7344c96abbfcf3ecef2f5ad2 Mon Sep 17 00:00:00 2001 From: Reynold Xin Date: Thu, 17 Sep 2015 22:05:20 -0700 Subject: [PATCH 336/802] [SPARK-10682] [GRAPHX] Remove Bagel test suites. Bagel has been deprecated and we haven't done any changes to it. 
There is no need to run those tests. This should speed up tests by 1 min. Author: Reynold Xin Closes #8807 from rxin/SPARK-10682. --- bagel/src/test/resources/log4j.properties | 27 ----- .../org/apache/spark/bagel/BagelSuite.scala | 113 ------------------ 2 files changed, 140 deletions(-) delete mode 100644 bagel/src/test/resources/log4j.properties delete mode 100644 bagel/src/test/scala/org/apache/spark/bagel/BagelSuite.scala diff --git a/bagel/src/test/resources/log4j.properties b/bagel/src/test/resources/log4j.properties deleted file mode 100644 index edbecdae92096..0000000000000 --- a/bagel/src/test/resources/log4j.properties +++ /dev/null @@ -1,27 +0,0 @@ -# -# Licensed to the Apache Software Foundation (ASF) under one or more -# contributor license agreements. See the NOTICE file distributed with -# this work for additional information regarding copyright ownership. -# The ASF licenses this file to You under the Apache License, Version 2.0 -# (the "License"); you may not use this file except in compliance with -# the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# - -# Set everything to be logged to the file target/unit-tests.log -log4j.rootCategory=INFO, file -log4j.appender.file=org.apache.log4j.FileAppender -log4j.appender.file.append=true -log4j.appender.file.file=target/unit-tests.log -log4j.appender.file.layout=org.apache.log4j.PatternLayout -log4j.appender.file.layout.ConversionPattern=%d{yy/MM/dd HH:mm:ss.SSS} %t %p %c{1}: %m%n - -# Ignore messages below warning level from Jetty, because it's a bit verbose -log4j.logger.org.spark-project.jetty=WARN diff --git a/bagel/src/test/scala/org/apache/spark/bagel/BagelSuite.scala b/bagel/src/test/scala/org/apache/spark/bagel/BagelSuite.scala deleted file mode 100644 index fb10d734ac74b..0000000000000 --- a/bagel/src/test/scala/org/apache/spark/bagel/BagelSuite.scala +++ /dev/null @@ -1,113 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -package org.apache.spark.bagel - -import org.scalatest.{BeforeAndAfter, Assertions} -import org.scalatest.concurrent.Timeouts -import org.scalatest.time.SpanSugar._ - -import org.apache.spark._ -import org.apache.spark.storage.StorageLevel - -class TestVertex(val active: Boolean, val age: Int) extends Vertex with Serializable -class TestMessage(val targetId: String) extends Message[String] with Serializable - -class BagelSuite extends SparkFunSuite with Assertions with BeforeAndAfter with Timeouts { - - var sc: SparkContext = _ - - after { - if (sc != null) { - sc.stop() - sc = null - } - } - - test("halting by voting") { - sc = new SparkContext("local", "test") - val verts = sc.parallelize(Array("a", "b", "c", "d").map(id => (id, new TestVertex(true, 0)))) - val msgs = sc.parallelize(Array[(String, TestMessage)]()) - val numSupersteps = 5 - val result = - Bagel.run(sc, verts, msgs, sc.defaultParallelism) { - (self: TestVertex, msgs: Option[Array[TestMessage]], superstep: Int) => - (new TestVertex(superstep < numSupersteps - 1, self.age + 1), Array[TestMessage]()) - } - for ((id, vert) <- result.collect) { - assert(vert.age === numSupersteps) - } - } - - test("halting by message silence") { - sc = new SparkContext("local", "test") - val verts = sc.parallelize(Array("a", "b", "c", "d").map(id => (id, new TestVertex(false, 0)))) - val msgs = sc.parallelize(Array("a" -> new TestMessage("a"))) - val numSupersteps = 5 - val result = - Bagel.run(sc, verts, msgs, sc.defaultParallelism) { - (self: TestVertex, msgs: Option[Array[TestMessage]], superstep: Int) => - val msgsOut = - msgs match { - case Some(ms) if (superstep < numSupersteps - 1) => - ms - case _ => - Array[TestMessage]() - } - (new TestVertex(self.active, self.age + 1), msgsOut) - } - for ((id, vert) <- result.collect) { - assert(vert.age === numSupersteps) - } - } - - test("large number of iterations") { - // This tests whether jobs with a large number of iterations finish in a reasonable time, - // because non-memoized recursion in RDD or DAGScheduler used to cause them to hang - failAfter(30 seconds) { - sc = new SparkContext("local", "test") - val verts = sc.parallelize((1 to 4).map(id => (id.toString, new TestVertex(true, 0)))) - val msgs = sc.parallelize(Array[(String, TestMessage)]()) - val numSupersteps = 50 - val result = - Bagel.run(sc, verts, msgs, sc.defaultParallelism) { - (self: TestVertex, msgs: Option[Array[TestMessage]], superstep: Int) => - (new TestVertex(superstep < numSupersteps - 1, self.age + 1), Array[TestMessage]()) - } - for ((id, vert) <- result.collect) { - assert(vert.age === numSupersteps) - } - } - } - - test("using non-default persistence level") { - failAfter(10 seconds) { - sc = new SparkContext("local", "test") - val verts = sc.parallelize((1 to 4).map(id => (id.toString, new TestVertex(true, 0)))) - val msgs = sc.parallelize(Array[(String, TestMessage)]()) - val numSupersteps = 20 - val result = - Bagel.run(sc, verts, msgs, sc.defaultParallelism, StorageLevel.DISK_ONLY) { - (self: TestVertex, msgs: Option[Array[TestMessage]], superstep: Int) => - (new TestVertex(superstep < numSupersteps - 1, self.age + 1), Array[TestMessage]()) - } - for ((id, vert) <- result.collect) { - assert(vert.age === numSupersteps) - } - } - } -} From 93c7650ab60a839a9cbe8b4ea1d5eda93e53ebe0 Mon Sep 17 00:00:00 2001 From: linweizhong Date: Thu, 17 Sep 2015 22:25:24 -0700 Subject: [PATCH 337/802] [SPARK-9522] [SQL] SparkSubmit process can not exit if kill application when HiveThriftServer was starting When we start 
HiveThriftServer, we will start SparkContext first, then start HiveServer2, if we kill application while HiveServer2 is starting then SparkContext will stop successfully, but SparkSubmit process can not exit. Author: linweizhong Closes #7853 from Sephiroth-Lin/SPARK-9522. --- core/src/main/scala/org/apache/spark/SparkContext.scala | 2 +- .../spark/sql/hive/thriftserver/HiveThriftServer2.scala | 6 ++++++ 2 files changed, 7 insertions(+), 1 deletion(-) diff --git a/core/src/main/scala/org/apache/spark/SparkContext.scala b/core/src/main/scala/org/apache/spark/SparkContext.scala index 9c3218719f7fc..ebd8e946ee7a2 100644 --- a/core/src/main/scala/org/apache/spark/SparkContext.scala +++ b/core/src/main/scala/org/apache/spark/SparkContext.scala @@ -97,7 +97,7 @@ class SparkContext(config: SparkConf) extends Logging with ExecutorAllocationCli val startTime = System.currentTimeMillis() - private val stopped: AtomicBoolean = new AtomicBoolean(false) + private[spark] val stopped: AtomicBoolean = new AtomicBoolean(false) private def assertNotStopped(): Unit = { if (stopped.get()) { diff --git a/sql/hive-thriftserver/src/main/scala/org/apache/spark/sql/hive/thriftserver/HiveThriftServer2.scala b/sql/hive-thriftserver/src/main/scala/org/apache/spark/sql/hive/thriftserver/HiveThriftServer2.scala index dd9fef9206d0b..a0643cec0fb7c 100644 --- a/sql/hive-thriftserver/src/main/scala/org/apache/spark/sql/hive/thriftserver/HiveThriftServer2.scala +++ b/sql/hive-thriftserver/src/main/scala/org/apache/spark/sql/hive/thriftserver/HiveThriftServer2.scala @@ -93,6 +93,12 @@ object HiveThriftServer2 extends Logging { } else { None } + // If application was killed before HiveThriftServer2 start successfully then SparkSubmit + // process can not exit, so check whether if SparkContext was stopped. + if (SparkSQLEnv.sparkContext.stopped.get()) { + logError("SparkContext has stopped even if HiveServer2 has started, so exit") + System.exit(-1) + } } catch { case e: Exception => logError("Error starting HiveThriftServer2", e) From 9a56dcdf7f19c9f7f913a2ce9bc981cb43a113c5 Mon Sep 17 00:00:00 2001 From: Felix Bechstein Date: Thu, 17 Sep 2015 22:42:46 -0700 Subject: [PATCH 338/802] docs/running-on-mesos.md: state default values in default column This PR simply uses the default value column for defaults. Author: Felix Bechstein Closes #8810 from felixb/fix_mesos_doc. --- docs/running-on-mesos.md | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/docs/running-on-mesos.md b/docs/running-on-mesos.md index 247e6ecfbdb86..1814fb32ed8a5 100644 --- a/docs/running-on-mesos.md +++ b/docs/running-on-mesos.md @@ -332,21 +332,21 @@ See the [configuration page](configuration.html) for information on Spark config spark.mesos.principal - Framework principal to authenticate to Mesos + (none) Set the principal with which Spark framework will use to authenticate with Mesos. spark.mesos.secret - Framework secret to authenticate to Mesos + (none)/td> Set the secret with which Spark framework will use to authenticate with Mesos. spark.mesos.role - Role for the Spark framework + * Set the role of this Spark framework for Mesos. Roles are used in Mesos for reservations and resource weight sharing. @@ -354,7 +354,7 @@ See the [configuration page](configuration.html) for information on Spark config spark.mesos.constraints - Attribute based constraints to be matched against when accepting resource offers. + (none) Attribute based constraints on mesos resource offers. By default, all resource offers will be accepted. 
Refer to Mesos Attributes & Resources for more information on attributes.
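For illustration only (the attribute names and values below are assumptions, not shipped defaults), such a constraint could be supplied at submit time through the generic configuration mechanism:

    ./bin/spark-submit \
      --master mesos://<mesos-master-host>:5050 \
      --conf spark.mesos.constraints="os:centos7;us-east-1:false" \
      <application-jar>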
    From 74d8f7dda82c3a16348f3ff22da83203e0b7f708 Mon Sep 17 00:00:00 2001 From: Reynold Xin Date: Thu, 17 Sep 2015 22:46:13 -0700 Subject: [PATCH 339/802] Added tag to documentation. --- docs/running-on-mesos.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/running-on-mesos.md b/docs/running-on-mesos.md index 1814fb32ed8a5..330c159c67bca 100644 --- a/docs/running-on-mesos.md +++ b/docs/running-on-mesos.md @@ -346,7 +346,7 @@ See the [configuration page](configuration.html) for information on Spark config spark.mesos.role - * + * Set the role of this Spark framework for Mesos. Roles are used in Mesos for reservations and resource weight sharing. From e3b5d6cb29e0f983fcc55920619e6433298955f5 Mon Sep 17 00:00:00 2001 From: "navis.ryu" Date: Fri, 18 Sep 2015 00:43:02 -0700 Subject: [PATCH 340/802] [SPARK-10684] [SQL] StructType.interpretedOrdering need not to be serialized Kryo fails with buffer overflow even with max value (2G). {noformat} org.apache.spark.SparkException: Kryo serialization failed: Buffer overflow. Available: 0, required: 1 Serialization trace: containsChild (org.apache.spark.sql.catalyst.expressions.BoundReference) child (org.apache.spark.sql.catalyst.expressions.SortOrder) array (scala.collection.mutable.ArraySeq) ordering (org.apache.spark.sql.catalyst.expressions.InterpretedOrdering) interpretedOrdering (org.apache.spark.sql.types.StructType) schema (org.apache.spark.sql.catalyst.expressions.GenericRowWithSchema). To avoid this, increase spark.kryoserializer.buffer.max value. at org.apache.spark.serializer.KryoSerializerInstance.serialize(KryoSerializer.scala:263) at org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:240) at java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1145) at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:615) at java.lang.Thread.run(Thread.java:745) {noformat} Author: navis.ryu Closes #8808 from navis/SPARK-10684. --- .../main/scala/org/apache/spark/sql/types/StructType.scala | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/types/StructType.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/types/StructType.scala index d8968ef806390..b29cf22dcb582 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/types/StructType.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/types/StructType.scala @@ -305,7 +305,9 @@ case class StructType(fields: Array[StructField]) extends DataType with Seq[Stru f(this) || fields.exists(field => field.dataType.existsRecursively(f)) } - private[sql] val interpretedOrdering = InterpretedOrdering.forSchema(this.fields.map(_.dataType)) + @transient + private[sql] lazy val interpretedOrdering = + InterpretedOrdering.forSchema(this.fields.map(_.dataType)) } object StructType extends AbstractDataType { From 20fd35dfd1ac402b622604e7bbedcc53a580b0a2 Mon Sep 17 00:00:00 2001 From: Yash Datta Date: Fri, 18 Sep 2015 08:22:38 -0700 Subject: [PATCH 341/802] [SPARK-10451] [SQL] Prevent unnecessary serializations in InMemoryColumnarTableScan Many of the fields in InMemoryColumnar scan and InMemoryRelation can be made transient. This reduces my 1000ms job to abt 700 ms . The task size reduces from 2.8 mb to ~1300kb Author: Yash Datta Closes #8604 from saucam/serde. 
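Both techniques used here (marking driver-only fields @transient, and copying the few values a closure actually needs into local vals) can be sketched in isolation as follows; the class and field names are illustrative stand-ins, not the actual Spark types:

    import org.apache.spark.rdd.RDD

    // Illustrative stand-in for a plan node. Heavy, driver-only members are @transient so
    // serialization skips them if the object is ever pulled into a task closure.
    class CachedScan(
        @transient val childPlan: AnyRef,            // driver-only
        @transient val buffers: RDD[Array[Byte]],    // driver-side handle to cached batches
        val columnNames: Seq[String]) extends Serializable {

      def scan(): RDD[Int] = {
        // Copy just the small value the closure needs into a local val so the closure
        // captures that value instead of `this` (which would drag in childPlan, etc.).
        val names = columnNames
        buffers.mapPartitions { iter =>
          iter.map(bytes => bytes.length + names.length)
        }
      }
    }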
--- .../columnar/InMemoryColumnarTableScan.scala | 35 +++++++++++-------- 1 file changed, 21 insertions(+), 14 deletions(-) diff --git a/sql/core/src/main/scala/org/apache/spark/sql/columnar/InMemoryColumnarTableScan.scala b/sql/core/src/main/scala/org/apache/spark/sql/columnar/InMemoryColumnarTableScan.scala index 66d429bc06198..d7e145f9c2bb8 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/columnar/InMemoryColumnarTableScan.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/columnar/InMemoryColumnarTableScan.scala @@ -48,10 +48,10 @@ private[sql] case class InMemoryRelation( useCompression: Boolean, batchSize: Int, storageLevel: StorageLevel, - child: SparkPlan, + @transient child: SparkPlan, tableName: Option[String])( - private var _cachedColumnBuffers: RDD[CachedBatch] = null, - private var _statistics: Statistics = null, + @transient private var _cachedColumnBuffers: RDD[CachedBatch] = null, + @transient private var _statistics: Statistics = null, private var _batchStats: Accumulable[ArrayBuffer[InternalRow], InternalRow] = null) extends LogicalPlan with MultiInstanceRelation { @@ -62,7 +62,7 @@ private[sql] case class InMemoryRelation( _batchStats } - val partitionStatistics = new PartitionStatistics(output) + @transient val partitionStatistics = new PartitionStatistics(output) private def computeSizeInBytes = { val sizeOfRow: Expression = @@ -196,7 +196,7 @@ private[sql] case class InMemoryRelation( private[sql] case class InMemoryColumnarTableScan( attributes: Seq[Attribute], predicates: Seq[Expression], - relation: InMemoryRelation) + @transient relation: InMemoryRelation) extends LeafNode { override def output: Seq[Attribute] = attributes @@ -205,7 +205,7 @@ private[sql] case class InMemoryColumnarTableScan( // Returned filter predicate should return false iff it is impossible for the input expression // to evaluate to `true' based on statistics collected about this partition batch. - val buildFilter: PartialFunction[Expression, Expression] = { + @transient val buildFilter: PartialFunction[Expression, Expression] = { case And(lhs: Expression, rhs: Expression) if buildFilter.isDefinedAt(lhs) || buildFilter.isDefinedAt(rhs) => (buildFilter.lift(lhs) ++ buildFilter.lift(rhs)).reduce(_ && _) @@ -268,16 +268,23 @@ private[sql] case class InMemoryColumnarTableScan( readBatches.setValue(0) } - relation.cachedColumnBuffers.mapPartitions { cachedBatchIterator => - val partitionFilter = newPredicate( - partitionFilters.reduceOption(And).getOrElse(Literal(true)), - relation.partitionStatistics.schema) + // Using these variables here to avoid serialization of entire objects (if referenced directly) + // within the map Partitions closure. + val schema = relation.partitionStatistics.schema + val schemaIndex = schema.zipWithIndex + val relOutput = relation.output + val buffers = relation.cachedColumnBuffers + + buffers.mapPartitions { cachedBatchIterator => + val partitionFilter = newPredicate( + partitionFilters.reduceOption(And).getOrElse(Literal(true)), + schema) // Find the ordinals and data types of the requested columns. If none are requested, use the // narrowest (the field with minimum default element size). 
val (requestedColumnIndices, requestedColumnDataTypes) = if (attributes.isEmpty) { val (narrowestOrdinal, narrowestDataType) = - relation.output.zipWithIndex.map { case (a, ordinal) => + relOutput.zipWithIndex.map { case (a, ordinal) => ordinal -> a.dataType } minBy { case (_, dataType) => ColumnType(dataType).defaultSize @@ -285,7 +292,7 @@ private[sql] case class InMemoryColumnarTableScan( Seq(narrowestOrdinal) -> Seq(narrowestDataType) } else { attributes.map { a => - relation.output.indexWhere(_.exprId == a.exprId) -> a.dataType + relOutput.indexWhere(_.exprId == a.exprId) -> a.dataType }.unzip } @@ -296,7 +303,7 @@ private[sql] case class InMemoryColumnarTableScan( // Build column accessors val columnAccessors = requestedColumnIndices.map { batchColumnIndex => ColumnAccessor( - relation.output(batchColumnIndex).dataType, + relOutput(batchColumnIndex).dataType, ByteBuffer.wrap(cachedBatch.buffers(batchColumnIndex))) } @@ -328,7 +335,7 @@ private[sql] case class InMemoryColumnarTableScan( if (inMemoryPartitionPruningEnabled) { cachedBatchIterator.filter { cachedBatch => if (!partitionFilter(cachedBatch.stats)) { - def statsString: String = relation.partitionStatistics.schema.zipWithIndex.map { + def statsString: String = schemaIndex.map { case (a, i) => val value = cachedBatch.stats.get(i, a.dataType) s"${a.name}: $value" From 35e8ab939000d4a1a01c1af4015c25ff6f4013a3 Mon Sep 17 00:00:00 2001 From: Yanbo Liang Date: Fri, 18 Sep 2015 09:53:52 -0700 Subject: [PATCH 342/802] [SPARK-10615] [PYSPARK] change assertEquals to assertEqual As ```assertEquals``` is deprecated, so we need to change ```assertEquals``` to ```assertEqual``` for existing python unit tests. Author: Yanbo Liang Closes #8814 from yanboliang/spark-10615. --- python/pyspark/ml/tests.py | 16 +-- python/pyspark/mllib/tests.py | 162 +++++++++++++++--------------- python/pyspark/sql/tests.py | 18 ++-- python/pyspark/streaming/tests.py | 2 +- 4 files changed, 99 insertions(+), 99 deletions(-) diff --git a/python/pyspark/ml/tests.py b/python/pyspark/ml/tests.py index b892318f50bd9..648fa8858fba3 100644 --- a/python/pyspark/ml/tests.py +++ b/python/pyspark/ml/tests.py @@ -182,7 +182,7 @@ def test_params(self): self.assertEqual(testParams.getMaxIter(), 10) testParams.setMaxIter(100) self.assertTrue(testParams.isSet(maxIter)) - self.assertEquals(testParams.getMaxIter(), 100) + self.assertEqual(testParams.getMaxIter(), 100) self.assertTrue(testParams.hasParam(inputCol)) self.assertFalse(testParams.hasDefault(inputCol)) @@ -195,7 +195,7 @@ def test_params(self): testParams._setDefault(seed=41) testParams.setSeed(43) - self.assertEquals( + self.assertEqual( testParams.explainParams(), "\n".join(["inputCol: input column name (undefined)", "maxIter: max number of iterations (>= 0) (default: 10, current: 100)", @@ -264,23 +264,23 @@ def test_ngram(self): self.assertEqual(ngram0.getInputCol(), "input") self.assertEqual(ngram0.getOutputCol(), "output") transformedDF = ngram0.transform(dataset) - self.assertEquals(transformedDF.head().output, ["a b c d", "b c d e"]) + self.assertEqual(transformedDF.head().output, ["a b c d", "b c d e"]) def test_stopwordsremover(self): sqlContext = SQLContext(self.sc) dataset = sqlContext.createDataFrame([Row(input=["a", "panda"])]) stopWordRemover = StopWordsRemover(inputCol="input", outputCol="output") # Default - self.assertEquals(stopWordRemover.getInputCol(), "input") + self.assertEqual(stopWordRemover.getInputCol(), "input") transformedDF = stopWordRemover.transform(dataset) - 
self.assertEquals(transformedDF.head().output, ["panda"]) + self.assertEqual(transformedDF.head().output, ["panda"]) # Custom stopwords = ["panda"] stopWordRemover.setStopWords(stopwords) - self.assertEquals(stopWordRemover.getInputCol(), "input") - self.assertEquals(stopWordRemover.getStopWords(), stopwords) + self.assertEqual(stopWordRemover.getInputCol(), "input") + self.assertEqual(stopWordRemover.getStopWords(), stopwords) transformedDF = stopWordRemover.transform(dataset) - self.assertEquals(transformedDF.head().output, ["a"]) + self.assertEqual(transformedDF.head().output, ["a"]) class HasInducedError(Params): diff --git a/python/pyspark/mllib/tests.py b/python/pyspark/mllib/tests.py index 636f9a06cab7b..96cf13495aa95 100644 --- a/python/pyspark/mllib/tests.py +++ b/python/pyspark/mllib/tests.py @@ -166,13 +166,13 @@ def test_dot(self): [1., 2., 3., 4.], [1., 2., 3., 4.]]) arr = pyarray.array('d', [0, 1, 2, 3]) - self.assertEquals(10.0, sv.dot(dv)) + self.assertEqual(10.0, sv.dot(dv)) self.assertTrue(array_equal(array([3., 6., 9., 12.]), sv.dot(mat))) - self.assertEquals(30.0, dv.dot(dv)) + self.assertEqual(30.0, dv.dot(dv)) self.assertTrue(array_equal(array([10., 20., 30., 40.]), dv.dot(mat))) - self.assertEquals(30.0, lst.dot(dv)) + self.assertEqual(30.0, lst.dot(dv)) self.assertTrue(array_equal(array([10., 20., 30., 40.]), lst.dot(mat))) - self.assertEquals(7.0, sv.dot(arr)) + self.assertEqual(7.0, sv.dot(arr)) def test_squared_distance(self): sv = SparseVector(4, {1: 1, 3: 2}) @@ -181,27 +181,27 @@ def test_squared_distance(self): lst1 = [4, 3, 2, 1] arr = pyarray.array('d', [0, 2, 1, 3]) narr = array([0, 2, 1, 3]) - self.assertEquals(15.0, _squared_distance(sv, dv)) - self.assertEquals(25.0, _squared_distance(sv, lst)) - self.assertEquals(20.0, _squared_distance(dv, lst)) - self.assertEquals(15.0, _squared_distance(dv, sv)) - self.assertEquals(25.0, _squared_distance(lst, sv)) - self.assertEquals(20.0, _squared_distance(lst, dv)) - self.assertEquals(0.0, _squared_distance(sv, sv)) - self.assertEquals(0.0, _squared_distance(dv, dv)) - self.assertEquals(0.0, _squared_distance(lst, lst)) - self.assertEquals(25.0, _squared_distance(sv, lst1)) - self.assertEquals(3.0, _squared_distance(sv, arr)) - self.assertEquals(3.0, _squared_distance(sv, narr)) + self.assertEqual(15.0, _squared_distance(sv, dv)) + self.assertEqual(25.0, _squared_distance(sv, lst)) + self.assertEqual(20.0, _squared_distance(dv, lst)) + self.assertEqual(15.0, _squared_distance(dv, sv)) + self.assertEqual(25.0, _squared_distance(lst, sv)) + self.assertEqual(20.0, _squared_distance(lst, dv)) + self.assertEqual(0.0, _squared_distance(sv, sv)) + self.assertEqual(0.0, _squared_distance(dv, dv)) + self.assertEqual(0.0, _squared_distance(lst, lst)) + self.assertEqual(25.0, _squared_distance(sv, lst1)) + self.assertEqual(3.0, _squared_distance(sv, arr)) + self.assertEqual(3.0, _squared_distance(sv, narr)) def test_hash(self): v1 = DenseVector([0.0, 1.0, 0.0, 5.5]) v2 = SparseVector(4, [(1, 1.0), (3, 5.5)]) v3 = DenseVector([0.0, 1.0, 0.0, 5.5]) v4 = SparseVector(4, [(1, 1.0), (3, 2.5)]) - self.assertEquals(hash(v1), hash(v2)) - self.assertEquals(hash(v1), hash(v3)) - self.assertEquals(hash(v2), hash(v3)) + self.assertEqual(hash(v1), hash(v2)) + self.assertEqual(hash(v1), hash(v3)) + self.assertEqual(hash(v2), hash(v3)) self.assertFalse(hash(v1) == hash(v4)) self.assertFalse(hash(v2) == hash(v4)) @@ -212,8 +212,8 @@ def test_eq(self): v4 = SparseVector(6, [(1, 1.0), (3, 5.5)]) v5 = DenseVector([0.0, 1.0, 0.0, 2.5]) v6 = 
SparseVector(4, [(1, 1.0), (3, 2.5)]) - self.assertEquals(v1, v2) - self.assertEquals(v1, v3) + self.assertEqual(v1, v2) + self.assertEqual(v1, v3) self.assertFalse(v2 == v4) self.assertFalse(v1 == v5) self.assertFalse(v1 == v6) @@ -238,13 +238,13 @@ def test_conversion(self): def test_sparse_vector_indexing(self): sv = SparseVector(4, {1: 1, 3: 2}) - self.assertEquals(sv[0], 0.) - self.assertEquals(sv[3], 2.) - self.assertEquals(sv[1], 1.) - self.assertEquals(sv[2], 0.) - self.assertEquals(sv[-1], 2) - self.assertEquals(sv[-2], 0) - self.assertEquals(sv[-4], 0) + self.assertEqual(sv[0], 0.) + self.assertEqual(sv[3], 2.) + self.assertEqual(sv[1], 1.) + self.assertEqual(sv[2], 0.) + self.assertEqual(sv[-1], 2) + self.assertEqual(sv[-2], 0) + self.assertEqual(sv[-4], 0) for ind in [4, -5]: self.assertRaises(ValueError, sv.__getitem__, ind) for ind in [7.8, '1']: @@ -255,7 +255,7 @@ def test_matrix_indexing(self): expected = [[0, 6], [1, 8], [4, 10]] for i in range(3): for j in range(2): - self.assertEquals(mat[i, j], expected[i][j]) + self.assertEqual(mat[i, j], expected[i][j]) def test_repr_dense_matrix(self): mat = DenseMatrix(3, 2, [0, 1, 4, 6, 8, 10]) @@ -308,11 +308,11 @@ def test_sparse_matrix(self): # Test sparse matrix creation. sm1 = SparseMatrix( 3, 4, [0, 2, 2, 4, 4], [1, 2, 1, 2], [1.0, 2.0, 4.0, 5.0]) - self.assertEquals(sm1.numRows, 3) - self.assertEquals(sm1.numCols, 4) - self.assertEquals(sm1.colPtrs.tolist(), [0, 2, 2, 4, 4]) - self.assertEquals(sm1.rowIndices.tolist(), [1, 2, 1, 2]) - self.assertEquals(sm1.values.tolist(), [1.0, 2.0, 4.0, 5.0]) + self.assertEqual(sm1.numRows, 3) + self.assertEqual(sm1.numCols, 4) + self.assertEqual(sm1.colPtrs.tolist(), [0, 2, 2, 4, 4]) + self.assertEqual(sm1.rowIndices.tolist(), [1, 2, 1, 2]) + self.assertEqual(sm1.values.tolist(), [1.0, 2.0, 4.0, 5.0]) self.assertTrue( repr(sm1), 'SparseMatrix(3, 4, [0, 2, 2, 4, 4], [1, 2, 1, 2], [1.0, 2.0, 4.0, 5.0], False)') @@ -325,13 +325,13 @@ def test_sparse_matrix(self): for i in range(3): for j in range(4): - self.assertEquals(expected[i][j], sm1[i, j]) + self.assertEqual(expected[i][j], sm1[i, j]) self.assertTrue(array_equal(sm1.toArray(), expected)) # Test conversion to dense and sparse. 
smnew = sm1.toDense().toSparse() - self.assertEquals(sm1.numRows, smnew.numRows) - self.assertEquals(sm1.numCols, smnew.numCols) + self.assertEqual(sm1.numRows, smnew.numRows) + self.assertEqual(sm1.numCols, smnew.numCols) self.assertTrue(array_equal(sm1.colPtrs, smnew.colPtrs)) self.assertTrue(array_equal(sm1.rowIndices, smnew.rowIndices)) self.assertTrue(array_equal(sm1.values, smnew.values)) @@ -339,11 +339,11 @@ def test_sparse_matrix(self): sm1t = SparseMatrix( 3, 4, [0, 2, 3, 5], [0, 1, 2, 0, 2], [3.0, 2.0, 4.0, 9.0, 8.0], isTransposed=True) - self.assertEquals(sm1t.numRows, 3) - self.assertEquals(sm1t.numCols, 4) - self.assertEquals(sm1t.colPtrs.tolist(), [0, 2, 3, 5]) - self.assertEquals(sm1t.rowIndices.tolist(), [0, 1, 2, 0, 2]) - self.assertEquals(sm1t.values.tolist(), [3.0, 2.0, 4.0, 9.0, 8.0]) + self.assertEqual(sm1t.numRows, 3) + self.assertEqual(sm1t.numCols, 4) + self.assertEqual(sm1t.colPtrs.tolist(), [0, 2, 3, 5]) + self.assertEqual(sm1t.rowIndices.tolist(), [0, 1, 2, 0, 2]) + self.assertEqual(sm1t.values.tolist(), [3.0, 2.0, 4.0, 9.0, 8.0]) expected = [ [3, 2, 0, 0], @@ -352,18 +352,18 @@ def test_sparse_matrix(self): for i in range(3): for j in range(4): - self.assertEquals(expected[i][j], sm1t[i, j]) + self.assertEqual(expected[i][j], sm1t[i, j]) self.assertTrue(array_equal(sm1t.toArray(), expected)) def test_dense_matrix_is_transposed(self): mat1 = DenseMatrix(3, 2, [0, 4, 1, 6, 3, 9], isTransposed=True) mat = DenseMatrix(3, 2, [0, 1, 3, 4, 6, 9]) - self.assertEquals(mat1, mat) + self.assertEqual(mat1, mat) expected = [[0, 4], [1, 6], [3, 9]] for i in range(3): for j in range(2): - self.assertEquals(mat1[i, j], expected[i][j]) + self.assertEqual(mat1[i, j], expected[i][j]) self.assertTrue(array_equal(mat1.toArray(), expected)) sm = mat1.toSparse() @@ -412,8 +412,8 @@ def test_kmeans(self): ] clusters = KMeans.train(self.sc.parallelize(data), 2, initializationMode="k-means||", initializationSteps=7, epsilon=1e-4) - self.assertEquals(clusters.predict(data[0]), clusters.predict(data[1])) - self.assertEquals(clusters.predict(data[2]), clusters.predict(data[3])) + self.assertEqual(clusters.predict(data[0]), clusters.predict(data[1])) + self.assertEqual(clusters.predict(data[2]), clusters.predict(data[3])) def test_kmeans_deterministic(self): from pyspark.mllib.clustering import KMeans @@ -443,8 +443,8 @@ def test_gmm(self): clusters = GaussianMixture.train(data, 2, convergenceTol=0.001, maxIterations=10, seed=56) labels = clusters.predict(data).collect() - self.assertEquals(labels[0], labels[1]) - self.assertEquals(labels[2], labels[3]) + self.assertEqual(labels[0], labels[1]) + self.assertEqual(labels[2], labels[3]) def test_gmm_deterministic(self): from pyspark.mllib.clustering import GaussianMixture @@ -456,7 +456,7 @@ def test_gmm_deterministic(self): clusters2 = GaussianMixture.train(data, 5, convergenceTol=0.001, maxIterations=10, seed=63) for c1, c2 in zip(clusters1.weights, clusters2.weights): - self.assertEquals(round(c1, 7), round(c2, 7)) + self.assertEqual(round(c1, 7), round(c2, 7)) def test_classification(self): from pyspark.mllib.classification import LogisticRegressionWithSGD, SVMWithSGD, NaiveBayes @@ -711,18 +711,18 @@ def test_serialize(self): lil[1, 0] = 1 lil[3, 0] = 2 sv = SparseVector(4, {1: 1, 3: 2}) - self.assertEquals(sv, _convert_to_vector(lil)) - self.assertEquals(sv, _convert_to_vector(lil.tocsc())) - self.assertEquals(sv, _convert_to_vector(lil.tocoo())) - self.assertEquals(sv, _convert_to_vector(lil.tocsr())) - self.assertEquals(sv, 
_convert_to_vector(lil.todok())) + self.assertEqual(sv, _convert_to_vector(lil)) + self.assertEqual(sv, _convert_to_vector(lil.tocsc())) + self.assertEqual(sv, _convert_to_vector(lil.tocoo())) + self.assertEqual(sv, _convert_to_vector(lil.tocsr())) + self.assertEqual(sv, _convert_to_vector(lil.todok())) def serialize(l): return ser.loads(ser.dumps(_convert_to_vector(l))) - self.assertEquals(sv, serialize(lil)) - self.assertEquals(sv, serialize(lil.tocsc())) - self.assertEquals(sv, serialize(lil.tocsr())) - self.assertEquals(sv, serialize(lil.todok())) + self.assertEqual(sv, serialize(lil)) + self.assertEqual(sv, serialize(lil.tocsc())) + self.assertEqual(sv, serialize(lil.tocsr())) + self.assertEqual(sv, serialize(lil.todok())) def test_dot(self): from scipy.sparse import lil_matrix @@ -730,7 +730,7 @@ def test_dot(self): lil[1, 0] = 1 lil[3, 0] = 2 dv = DenseVector(array([1., 2., 3., 4.])) - self.assertEquals(10.0, dv.dot(lil)) + self.assertEqual(10.0, dv.dot(lil)) def test_squared_distance(self): from scipy.sparse import lil_matrix @@ -739,8 +739,8 @@ def test_squared_distance(self): lil[3, 0] = 2 dv = DenseVector(array([1., 2., 3., 4.])) sv = SparseVector(4, {0: 1, 1: 2, 2: 3, 3: 4}) - self.assertEquals(15.0, dv.squared_distance(lil)) - self.assertEquals(15.0, sv.squared_distance(lil)) + self.assertEqual(15.0, dv.squared_distance(lil)) + self.assertEqual(15.0, sv.squared_distance(lil)) def scipy_matrix(self, size, values): """Create a column SciPy matrix from a dictionary of values""" @@ -759,8 +759,8 @@ def test_clustering(self): self.scipy_matrix(3, {2: 1.1}) ] clusters = KMeans.train(self.sc.parallelize(data), 2, initializationMode="k-means||") - self.assertEquals(clusters.predict(data[0]), clusters.predict(data[1])) - self.assertEquals(clusters.predict(data[2]), clusters.predict(data[3])) + self.assertEqual(clusters.predict(data[0]), clusters.predict(data[1])) + self.assertEqual(clusters.predict(data[2]), clusters.predict(data[3])) def test_classification(self): from pyspark.mllib.classification import LogisticRegressionWithSGD, SVMWithSGD, NaiveBayes @@ -984,12 +984,12 @@ def test_word2vec_setters(self): .setNumIterations(10) \ .setSeed(1024) \ .setMinCount(3) - self.assertEquals(model.vectorSize, 2) + self.assertEqual(model.vectorSize, 2) self.assertTrue(model.learningRate < 0.02) - self.assertEquals(model.numPartitions, 2) - self.assertEquals(model.numIterations, 10) - self.assertEquals(model.seed, 1024) - self.assertEquals(model.minCount, 3) + self.assertEqual(model.numPartitions, 2) + self.assertEqual(model.numIterations, 10) + self.assertEqual(model.seed, 1024) + self.assertEqual(model.minCount, 3) def test_word2vec_get_vectors(self): data = [ @@ -1002,7 +1002,7 @@ def test_word2vec_get_vectors(self): ["a"] ] model = Word2Vec().fit(self.sc.parallelize(data)) - self.assertEquals(len(model.getVectors()), 3) + self.assertEqual(len(model.getVectors()), 3) class StandardScalerTests(MLlibTestCase): @@ -1044,8 +1044,8 @@ def test_model_params(self): """Test that the model params are set correctly""" stkm = StreamingKMeans() stkm.setK(5).setDecayFactor(0.0) - self.assertEquals(stkm._k, 5) - self.assertEquals(stkm._decayFactor, 0.0) + self.assertEqual(stkm._k, 5) + self.assertEqual(stkm._decayFactor, 0.0) # Model not set yet. 
self.assertIsNone(stkm.latestModel()) @@ -1053,9 +1053,9 @@ def test_model_params(self): stkm.setInitialCenters( centers=[[0.0, 0.0], [1.0, 1.0]], weights=[1.0, 1.0]) - self.assertEquals( + self.assertEqual( stkm.latestModel().centers, [[0.0, 0.0], [1.0, 1.0]]) - self.assertEquals(stkm.latestModel().clusterWeights, [1.0, 1.0]) + self.assertEqual(stkm.latestModel().clusterWeights, [1.0, 1.0]) def test_accuracy_for_single_center(self): """Test that parameters obtained are correct for a single center.""" @@ -1070,7 +1070,7 @@ def test_accuracy_for_single_center(self): self.ssc.start() def condition(): - self.assertEquals(stkm.latestModel().clusterWeights, [25.0]) + self.assertEqual(stkm.latestModel().clusterWeights, [25.0]) return True self._eventually(condition, catch_assertions=True) @@ -1114,7 +1114,7 @@ def test_trainOn_model(self): def condition(): finalModel = stkm.latestModel() self.assertTrue(all(finalModel.centers == array(initCenters))) - self.assertEquals(finalModel.clusterWeights, [5.0, 5.0, 5.0, 5.0]) + self.assertEqual(finalModel.clusterWeights, [5.0, 5.0, 5.0, 5.0]) return True self._eventually(condition, catch_assertions=True) @@ -1141,7 +1141,7 @@ def update(rdd): self.ssc.start() def condition(): - self.assertEquals(result, [[0], [1], [2], [3]]) + self.assertEqual(result, [[0], [1], [2], [3]]) return True self._eventually(condition, catch_assertions=True) @@ -1263,7 +1263,7 @@ def test_convergence(self): self.ssc.start() def condition(): - self.assertEquals(len(models), len(input_batches)) + self.assertEqual(len(models), len(input_batches)) return True # We want all batches to finish for this test. @@ -1297,7 +1297,7 @@ def test_predictions(self): self.ssc.start() def condition(): - self.assertEquals(len(true_predicted), len(input_batches)) + self.assertEqual(len(true_predicted), len(input_batches)) return True self._eventually(condition, catch_assertions=True) @@ -1400,7 +1400,7 @@ def test_parameter_convergence(self): self.ssc.start() def condition(): - self.assertEquals(len(model_weights), len(batches)) + self.assertEqual(len(model_weights), len(batches)) return True # We want all batches to finish for this test. @@ -1433,7 +1433,7 @@ def test_prediction(self): self.ssc.start() def condition(): - self.assertEquals(len(samples), len(batches)) + self.assertEqual(len(samples), len(batches)) return True # We want all batches to finish for this test. 
diff --git a/python/pyspark/sql/tests.py b/python/pyspark/sql/tests.py index f2172b7a27d88..3e680f1030a71 100644 --- a/python/pyspark/sql/tests.py +++ b/python/pyspark/sql/tests.py @@ -157,7 +157,7 @@ class DataTypeTests(unittest.TestCase): def test_data_type_eq(self): lt = LongType() lt2 = pickle.loads(pickle.dumps(LongType())) - self.assertEquals(lt, lt2) + self.assertEqual(lt, lt2) # regression test for SPARK-7978 def test_decimal_type(self): @@ -393,7 +393,7 @@ def test_infer_nested_schema(self): CustomRow(field1=2, field2="row2"), CustomRow(field1=3, field2="row3")]) df = self.sqlCtx.inferSchema(rdd) - self.assertEquals(Row(field1=1, field2=u'row1'), df.first()) + self.assertEqual(Row(field1=1, field2=u'row1'), df.first()) def test_create_dataframe_from_objects(self): data = [MyObject(1, "1"), MyObject(2, "2")] @@ -403,7 +403,7 @@ def test_create_dataframe_from_objects(self): def test_select_null_literal(self): df = self.sqlCtx.sql("select null as col") - self.assertEquals(Row(col=None), df.first()) + self.assertEqual(Row(col=None), df.first()) def test_apply_schema(self): from datetime import date, datetime @@ -519,14 +519,14 @@ def test_apply_schema_with_udt(self): StructField("point", ExamplePointUDT(), False)]) df = self.sqlCtx.createDataFrame([row], schema) point = df.head().point - self.assertEquals(point, ExamplePoint(1.0, 2.0)) + self.assertEqual(point, ExamplePoint(1.0, 2.0)) row = (1.0, PythonOnlyPoint(1.0, 2.0)) schema = StructType([StructField("label", DoubleType(), False), StructField("point", PythonOnlyUDT(), False)]) df = self.sqlCtx.createDataFrame([row], schema) point = df.head().point - self.assertEquals(point, PythonOnlyPoint(1.0, 2.0)) + self.assertEqual(point, PythonOnlyPoint(1.0, 2.0)) def test_udf_with_udt(self): from pyspark.sql.tests import ExamplePoint, ExamplePointUDT @@ -554,14 +554,14 @@ def test_parquet_with_udt(self): df0.write.parquet(output_dir) df1 = self.sqlCtx.parquetFile(output_dir) point = df1.head().point - self.assertEquals(point, ExamplePoint(1.0, 2.0)) + self.assertEqual(point, ExamplePoint(1.0, 2.0)) row = Row(label=1.0, point=PythonOnlyPoint(1.0, 2.0)) df0 = self.sqlCtx.createDataFrame([row]) df0.write.parquet(output_dir, mode='overwrite') df1 = self.sqlCtx.parquetFile(output_dir) point = df1.head().point - self.assertEquals(point, PythonOnlyPoint(1.0, 2.0)) + self.assertEqual(point, PythonOnlyPoint(1.0, 2.0)) def test_column_operators(self): ci = self.df.key @@ -826,8 +826,8 @@ def test_infer_long_type(self): output_dir = os.path.join(self.tempdir.name, "infer_long_type") df.saveAsParquetFile(output_dir) df1 = self.sqlCtx.parquetFile(output_dir) - self.assertEquals('a', df1.first().f1) - self.assertEquals(100000000000000, df1.first().f2) + self.assertEqual('a', df1.first().f1) + self.assertEqual(100000000000000, df1.first().f2) self.assertEqual(_infer_type(1), LongType()) self.assertEqual(_infer_type(2**10), LongType()) diff --git a/python/pyspark/streaming/tests.py b/python/pyspark/streaming/tests.py index cfea95b0dec71..e4e56fff3b3fc 100644 --- a/python/pyspark/streaming/tests.py +++ b/python/pyspark/streaming/tests.py @@ -693,7 +693,7 @@ def check_output(n): # Verify that getActiveOrCreate() returns active context self.setupCalled = False - self.assertEquals(StreamingContext.getActiveOrCreate(self.cpd, setup), self.ssc) + self.assertEqual(StreamingContext.getActiveOrCreate(self.cpd, setup), self.ssc) self.assertFalse(self.setupCalled) # Verify that getActiveOrCreate() uses existing SparkContext From 
00a2911c5bea67a1a4796fb1d6fd5d0a8ee79001 Mon Sep 17 00:00:00 2001 From: Cheng Lian Date: Fri, 18 Sep 2015 12:19:08 -0700 Subject: [PATCH 343/802] [SPARK-10540] Fixes flaky all-data-type test This PR breaks the original test case into multiple ones (one test case for each data type). In this way, test failure output can be much more readable. Within each test case, we build a table with two columns, one of them is for the data type to test, the other is an "index" column, which is used to sort the DataFrame and workaround [SPARK-10591] [1] [1]: https://issues.apache.org/jira/browse/SPARK-10591 Author: Cheng Lian Closes #8768 from liancheng/spark-10540/test-all-data-types. --- .../sql/sources/hadoopFsRelationSuites.scala | 109 +++++++----------- 1 file changed, 43 insertions(+), 66 deletions(-) diff --git a/sql/hive/src/test/scala/org/apache/spark/sql/sources/hadoopFsRelationSuites.scala b/sql/hive/src/test/scala/org/apache/spark/sql/sources/hadoopFsRelationSuites.scala index 8ffcef85668d6..d7504936d90e5 100644 --- a/sql/hive/src/test/scala/org/apache/spark/sql/sources/hadoopFsRelationSuites.scala +++ b/sql/hive/src/test/scala/org/apache/spark/sql/sources/hadoopFsRelationSuites.scala @@ -100,80 +100,57 @@ abstract class HadoopFsRelationTest extends QueryTest with SQLTestUtils with Tes } } - ignore("test all data types") { - withTempPath { file => - // Create the schema. - val struct = - StructType( - StructField("f1", FloatType, true) :: - StructField("f2", ArrayType(BooleanType), true) :: Nil) - // TODO: add CalendarIntervalType to here once we can save it out. - val dataTypes = - Seq( - StringType, BinaryType, NullType, BooleanType, - ByteType, ShortType, IntegerType, LongType, - FloatType, DoubleType, DecimalType(25, 5), DecimalType(6, 5), - DateType, TimestampType, - ArrayType(IntegerType), MapType(StringType, LongType), struct, - new MyDenseVectorUDT()) - val fields = dataTypes.zipWithIndex.map { case (dataType, index) => - StructField(s"col$index", dataType, nullable = true) - } - val schema = StructType(fields) - - // Generate data at the driver side. We need to materialize the data first and then - // create RDD. - val maybeDataGenerator = - RandomDataGenerator.forType( - dataType = schema, + private val supportedDataTypes = Seq( + StringType, BinaryType, + NullType, BooleanType, + ByteType, ShortType, IntegerType, LongType, + FloatType, DoubleType, DecimalType(25, 5), DecimalType(6, 5), + DateType, TimestampType, + ArrayType(IntegerType), + MapType(StringType, LongType), + new StructType() + .add("f1", FloatType, nullable = true) + .add("f2", ArrayType(BooleanType, containsNull = true), nullable = true), + new MyDenseVectorUDT() + ).filter(supportsDataType) + + for (dataType <- supportedDataTypes) { + test(s"test all data types - $dataType") { + withTempPath { file => + val path = file.getCanonicalPath + + val dataGenerator = RandomDataGenerator.forType( + dataType = dataType, nullable = true, - seed = Some(System.nanoTime())) - val dataGenerator = - maybeDataGenerator - .getOrElse(fail(s"Failed to create data generator for schema $schema")) - val data = (1 to 10).map { i => - dataGenerator.apply() match { - case row: Row => row - case null => Row.fromSeq(Seq.fill(schema.length)(null)) - case other => - fail(s"Row or null is expected to be generated, " + - s"but a ${other.getClass.getCanonicalName} is generated.") + seed = Some(System.nanoTime()) + ).getOrElse { + fail(s"Failed to create data generator for schema $dataType") } - } - // Create a DF for the schema with random data. 
- val rdd = sqlContext.sparkContext.parallelize(data, 10) - val df = sqlContext.createDataFrame(rdd, schema) + // Create a DF for the schema with random data. The index field is used to sort the + // DataFrame. This is a workaround for SPARK-10591. + val schema = new StructType() + .add("index", IntegerType, nullable = false) + .add("col", dataType, nullable = true) + val rdd = sqlContext.sparkContext.parallelize((1 to 10).map(i => Row(i, dataGenerator()))) + val df = sqlContext.createDataFrame(rdd, schema).orderBy("index").coalesce(1) - // All columns that have supported data types of this source. - val supportedColumns = schema.fields.collect { - case StructField(name, dataType, _, _) if supportsDataType(dataType) => name - } - val selectedColumns = util.Random.shuffle(supportedColumns.toSeq) - - val dfToBeSaved = df.selectExpr(selectedColumns: _*) - - // Save the data out. - dfToBeSaved - .write - .format(dataSourceName) - .option("dataSchema", dfToBeSaved.schema.json) // This option is just used by tests. - .save(file.getCanonicalPath) + df.write + .mode("overwrite") + .format(dataSourceName) + .option("dataSchema", df.schema.json) + .save(path) - val loadedDF = - sqlContext + val loadedDF = sqlContext .read .format(dataSourceName) - .schema(dfToBeSaved.schema) - .option("dataSchema", dfToBeSaved.schema.json) // This option is just used by tests. - .load(file.getCanonicalPath) - .selectExpr(selectedColumns: _*) + .option("dataSchema", df.schema.json) + .schema(df.schema) + .load(path) + .orderBy("index") - // Read the data back. - checkAnswer( - loadedDF, - dfToBeSaved - ) + checkAnswer(loadedDF, df) + } } } From c6f8135ee52202bd86adb090ab631e80330ea4df Mon Sep 17 00:00:00 2001 From: Yijie Shen Date: Fri, 18 Sep 2015 13:20:13 -0700 Subject: [PATCH 344/802] [SPARK-10539] [SQL] Project should not be pushed down through Intersect or Except #8742 Intersect and Except are both set operators and they use the all the columns to compare equality between rows. When pushing their Project parent down, the relations they based on would change, therefore not an equivalent transformation. JIRA: https://issues.apache.org/jira/browse/SPARK-10539 I added some comments based on the fix of https://github.com/apache/spark/pull/8742. Author: Yijie Shen Author: Yin Huai Closes #8823 from yhuai/fix_set_optimization. --- .../sql/catalyst/optimizer/Optimizer.scala | 37 ++++++++++--------- .../optimizer/SetOperationPushDownSuite.scala | 23 ++++++------ .../org/apache/spark/sql/DataFrameSuite.scala | 9 +++++ 3 files changed, 39 insertions(+), 30 deletions(-) diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/optimizer/Optimizer.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/optimizer/Optimizer.scala index 648a65e7c0eb3..324f40a051c38 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/optimizer/Optimizer.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/optimizer/Optimizer.scala @@ -85,7 +85,22 @@ object SamplePushDown extends Rule[LogicalPlan] { } /** - * Pushes operations to either side of a Union, Intersect or Except. + * Pushes certain operations to both sides of a Union, Intersect or Except operator. + * Operations that are safe to pushdown are listed as follows. + * Union: + * Right now, Union means UNION ALL, which does not de-duplicate rows. So, it is + * safe to pushdown Filters and Projections through it. Once we add UNION DISTINCT, + * we will not be able to pushdown Projections. 
+ * + * Intersect: + * It is not safe to pushdown Projections through it because we need to get the + * intersect of rows by comparing the entire rows. It is fine to pushdown Filters + * because we will not have non-deterministic expressions. + * + * Except: + * It is not safe to pushdown Projections through it because we need to get the + * intersect of rows by comparing the entire rows. It is fine to pushdown Filters + * because we will not have non-deterministic expressions. */ object SetOperationPushDown extends Rule[LogicalPlan] { @@ -122,40 +137,26 @@ object SetOperationPushDown extends Rule[LogicalPlan] { Filter(condition, left), Filter(pushToRight(condition, rewrites), right)) - // Push down projection into union + // Push down projection through UNION ALL case Project(projectList, u @ Union(left, right)) => val rewrites = buildRewrites(u) Union( Project(projectList, left), Project(projectList.map(pushToRight(_, rewrites)), right)) - // Push down filter into intersect + // Push down filter through INTERSECT case Filter(condition, i @ Intersect(left, right)) => val rewrites = buildRewrites(i) Intersect( Filter(condition, left), Filter(pushToRight(condition, rewrites), right)) - // Push down projection into intersect - case Project(projectList, i @ Intersect(left, right)) => - val rewrites = buildRewrites(i) - Intersect( - Project(projectList, left), - Project(projectList.map(pushToRight(_, rewrites)), right)) - - // Push down filter into except + // Push down filter through EXCEPT case Filter(condition, e @ Except(left, right)) => val rewrites = buildRewrites(e) Except( Filter(condition, left), Filter(pushToRight(condition, rewrites), right)) - - // Push down projection into except - case Project(projectList, e @ Except(left, right)) => - val rewrites = buildRewrites(e) - Except( - Project(projectList, left), - Project(projectList.map(pushToRight(_, rewrites)), right)) } } diff --git a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/optimizer/SetOperationPushDownSuite.scala b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/optimizer/SetOperationPushDownSuite.scala index 49c979bc7d72c..3fca47a023dc6 100644 --- a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/optimizer/SetOperationPushDownSuite.scala +++ b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/optimizer/SetOperationPushDownSuite.scala @@ -60,23 +60,22 @@ class SetOperationPushDownSuite extends PlanTest { comparePlans(exceptOptimized, exceptCorrectAnswer) } - test("union/intersect/except: project to each side") { + test("union: project to each side") { val unionQuery = testUnion.select('a) + val unionOptimized = Optimize.execute(unionQuery.analyze) + val unionCorrectAnswer = + Union(testRelation.select('a), testRelation2.select('d)).analyze + comparePlans(unionOptimized, unionCorrectAnswer) + } + + test("SPARK-10539: Project should not be pushed down through Intersect or Except") { val intersectQuery = testIntersect.select('b, 'c) val exceptQuery = testExcept.select('a, 'b, 'c) - val unionOptimized = Optimize.execute(unionQuery.analyze) val intersectOptimized = Optimize.execute(intersectQuery.analyze) val exceptOptimized = Optimize.execute(exceptQuery.analyze) - val unionCorrectAnswer = - Union(testRelation.select('a), testRelation2.select('d)).analyze - val intersectCorrectAnswer = - Intersect(testRelation.select('b, 'c), testRelation2.select('e, 'f)).analyze - val exceptCorrectAnswer = - Except(testRelation.select('a, 'b, 'c), testRelation2.select('d, 'e, 'f)).analyze - - 
comparePlans(unionOptimized, unionCorrectAnswer) - comparePlans(intersectOptimized, intersectCorrectAnswer) - comparePlans(exceptOptimized, exceptCorrectAnswer) } + comparePlans(intersectOptimized, intersectQuery.analyze) + comparePlans(exceptOptimized, exceptQuery.analyze) + } } diff --git a/sql/core/src/test/scala/org/apache/spark/sql/DataFrameSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/DataFrameSuite.scala index c167999af580e..1370713975f2f 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/DataFrameSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/DataFrameSuite.scala @@ -907,4 +907,13 @@ class DataFrameSuite extends QueryTest with SharedSQLContext { assert(row.getDouble(1) - row.getDouble(3) === 0.0 +- 0.001) } } + + test("SPARK-10539: Project should not be pushed down through Intersect or Except") { + val df1 = (1 to 100).map(Tuple1.apply).toDF("i") + val df2 = (1 to 30).map(Tuple1.apply).toDF("i") + val intersect = df1.intersect(df2) + val except = df1.except(df2) + assert(intersect.count() === 30) + assert(except.count() === 70) + } } From 3a22b1004f527d54d399dd0225cd7f2f8ffad9c5 Mon Sep 17 00:00:00 2001 From: Holden Karau Date: Fri, 18 Sep 2015 13:47:14 -0700 Subject: [PATCH 345/802] [SPARK-10449] [SQL] Don't merge decimal types with incompatable precision or scales From JIRA: Schema merging should only handle struct fields. But currently we also reconcile decimal precision and scale information. Author: Holden Karau Closes #8634 from holdenk/SPARK-10449-dont-merge-different-precision. --- .../org/apache/spark/sql/types/StructType.scala | 17 +++++++++++++---- 1 file changed, 13 insertions(+), 4 deletions(-) diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/types/StructType.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/types/StructType.scala index b29cf22dcb582..d6b436724b2a0 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/types/StructType.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/types/StructType.scala @@ -373,10 +373,19 @@ object StructType extends AbstractDataType { StructType(newFields) case (DecimalType.Fixed(leftPrecision, leftScale), - DecimalType.Fixed(rightPrecision, rightScale)) => - DecimalType( - max(leftScale, rightScale) + max(leftPrecision - leftScale, rightPrecision - rightScale), - max(leftScale, rightScale)) + DecimalType.Fixed(rightPrecision, rightScale)) => + if ((leftPrecision == rightPrecision) && (leftScale == rightScale)) { + DecimalType(leftPrecision, leftScale) + } else if ((leftPrecision != rightPrecision) && (leftScale != rightScale)) { + throw new SparkException("Failed to merge Decimal Tpes with incompatible " + + s"precision $leftPrecision and $rightPrecision & scale $leftScale and $rightScale") + } else if (leftPrecision != rightPrecision) { + throw new SparkException("Failed to merge Decimal Tpes with incompatible " + + s"precision $leftPrecision and $rightPrecision") + } else { + throw new SparkException("Failed to merge Decimal Tpes with incompatible " + + s"scala $leftScale and $rightScale") + } case (leftUdt: UserDefinedType[_], rightUdt: UserDefinedType[_]) if leftUdt.userClass == rightUdt.userClass => leftUdt From 348d7c9a93dd00d3d1859342a8eb0aea2e77f597 Mon Sep 17 00:00:00 2001 From: Reynold Xin Date: Fri, 18 Sep 2015 13:48:41 -0700 Subject: [PATCH 346/802] [SPARK-9808] Remove hash shuffle file consolidation. Author: Reynold Xin Closes #8812 from rxin/SPARK-9808-1. 
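[Editor's note] For context on what is being removed: hash-based shuffle writes one file per (map task, reducer) pair, and the consolidation mode deleted below existed to reuse one file group per concurrently executing task, with one file per reducer in each group. A rough back-of-the-envelope sketch, with assumed (not measured) figures:

{noformat}
// Illustrative arithmetic only; the task, reducer and core counts are assumptions.
object ShuffleFileCountSketch {
  def main(args: Array[String]): Unit = {
    val mapTasksPerExecutor = 125   // e.g. 1000 map tasks spread over 8 executors
    val reducers            = 200
    val concurrentTasks     = 16    // cores per executor

    // One file per (map task, reducer): the layout that remains after this patch.
    val perReducerFiles = mapTasksPerExecutor * reducers   // 25,000 files per executor

    // One file group per concurrently executing task, each holding one file per reducer:
    // the consolidation mode this patch removes.
    val consolidatedFiles = concurrentTasks * reducers     // 3,200 files per executor

    println(s"plain hash shuffle: $perReducerFiles files, consolidated: $consolidatedFiles files")
  }
}
{noformat}

For comparison, sort-based shuffle writes a single data file plus an index file per map task, which avoids the file-count problem altogether.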
--- .../shuffle/FileShuffleBlockResolver.scala | 178 ++---------------- .../apache/spark/storage/BlockManager.scala | 9 - .../org/apache/spark/storage/DiskStore.scala | 3 - .../hash/HashShuffleManagerSuite.scala | 110 ----------- docs/configuration.md | 10 - .../shuffle/ExternalShuffleBlockResolver.java | 4 - project/MimaExcludes.scala | 4 + 7 files changed, 17 insertions(+), 301 deletions(-) delete mode 100644 core/src/test/scala/org/apache/spark/shuffle/hash/HashShuffleManagerSuite.scala diff --git a/core/src/main/scala/org/apache/spark/shuffle/FileShuffleBlockResolver.scala b/core/src/main/scala/org/apache/spark/shuffle/FileShuffleBlockResolver.scala index c057de9b3f4df..d9902f96dfd4e 100644 --- a/core/src/main/scala/org/apache/spark/shuffle/FileShuffleBlockResolver.scala +++ b/core/src/main/scala/org/apache/spark/shuffle/FileShuffleBlockResolver.scala @@ -17,9 +17,7 @@ package org.apache.spark.shuffle -import java.io.File import java.util.concurrent.ConcurrentLinkedQueue -import java.util.concurrent.atomic.AtomicInteger import scala.collection.JavaConverters._ @@ -28,10 +26,8 @@ import org.apache.spark.executor.ShuffleWriteMetrics import org.apache.spark.network.buffer.{FileSegmentManagedBuffer, ManagedBuffer} import org.apache.spark.network.netty.SparkTransportConf import org.apache.spark.serializer.Serializer -import org.apache.spark.shuffle.FileShuffleBlockResolver.ShuffleFileGroup import org.apache.spark.storage._ import org.apache.spark.util.{MetadataCleaner, MetadataCleanerType, TimeStampedHashMap} -import org.apache.spark.util.collection.{PrimitiveKeyOpenHashMap, PrimitiveVector} /** A group of writers for a ShuffleMapTask, one writer per reducer. */ private[spark] trait ShuffleWriterGroup { @@ -43,24 +39,7 @@ private[spark] trait ShuffleWriterGroup { /** * Manages assigning disk-based block writers to shuffle tasks. Each shuffle task gets one file - * per reducer (this set of files is called a ShuffleFileGroup). - * - * As an optimization to reduce the number of physical shuffle files produced, multiple shuffle - * blocks are aggregated into the same file. There is one "combined shuffle file" per reducer - * per concurrently executing shuffle task. As soon as a task finishes writing to its shuffle - * files, it releases them for another task. - * Regarding the implementation of this feature, shuffle files are identified by a 3-tuple: - * - shuffleId: The unique id given to the entire shuffle stage. - * - bucketId: The id of the output partition (i.e., reducer id) - * - fileId: The unique id identifying a group of "combined shuffle files." Only one task at a - * time owns a particular fileId, and this id is returned to a pool when the task finishes. - * Each shuffle file is then mapped to a FileSegment, which is a 3-tuple (file, offset, length) - * that specifies where in a given file the actual block data is located. - * - * Shuffle file metadata is stored in a space-efficient manner. Rather than simply mapping - * ShuffleBlockIds directly to FileSegments, each ShuffleFileGroup maintains a list of offsets for - * each block stored in each file. In order to find the location of a shuffle block, we search the - * files within a ShuffleFileGroups associated with the block's reducer. + * per reducer. */ // Note: Changes to the format in this file should be kept in sync with // org.apache.spark.network.shuffle.ExternalShuffleBlockResolver#getHashBasedShuffleBlockData(). 
@@ -71,26 +50,15 @@ private[spark] class FileShuffleBlockResolver(conf: SparkConf) private lazy val blockManager = SparkEnv.get.blockManager - // Turning off shuffle file consolidation causes all shuffle Blocks to get their own file. - // TODO: Remove this once the shuffle file consolidation feature is stable. - private val consolidateShuffleFiles = - conf.getBoolean("spark.shuffle.consolidateFiles", false) - // Use getSizeAsKb (not bytes) to maintain backwards compatibility if no units are provided private val bufferSize = conf.getSizeAsKb("spark.shuffle.file.buffer", "32k").toInt * 1024 /** - * Contains all the state related to a particular shuffle. This includes a pool of unused - * ShuffleFileGroups, as well as all ShuffleFileGroups that have been created for the shuffle. + * Contains all the state related to a particular shuffle. */ - private class ShuffleState(val numBuckets: Int) { - val nextFileId = new AtomicInteger(0) - val unusedFileGroups = new ConcurrentLinkedQueue[ShuffleFileGroup]() - val allFileGroups = new ConcurrentLinkedQueue[ShuffleFileGroup]() - + private class ShuffleState(val numReducers: Int) { /** * The mapIds of all map tasks completed on this Executor for this shuffle. - * NB: This is only populated if consolidateShuffleFiles is FALSE. We don't need it otherwise. */ val completedMapTasks = new ConcurrentLinkedQueue[Int]() } @@ -104,24 +72,16 @@ private[spark] class FileShuffleBlockResolver(conf: SparkConf) * Get a ShuffleWriterGroup for the given map task, which will register it as complete * when the writers are closed successfully */ - def forMapTask(shuffleId: Int, mapId: Int, numBuckets: Int, serializer: Serializer, + def forMapTask(shuffleId: Int, mapId: Int, numReducers: Int, serializer: Serializer, writeMetrics: ShuffleWriteMetrics): ShuffleWriterGroup = { new ShuffleWriterGroup { - shuffleStates.putIfAbsent(shuffleId, new ShuffleState(numBuckets)) + shuffleStates.putIfAbsent(shuffleId, new ShuffleState(numReducers)) private val shuffleState = shuffleStates(shuffleId) - private var fileGroup: ShuffleFileGroup = null val openStartTime = System.nanoTime val serializerInstance = serializer.newInstance() - val writers: Array[DiskBlockObjectWriter] = if (consolidateShuffleFiles) { - fileGroup = getUnusedFileGroup() - Array.tabulate[DiskBlockObjectWriter](numBuckets) { bucketId => - val blockId = ShuffleBlockId(shuffleId, mapId, bucketId) - blockManager.getDiskWriter(blockId, fileGroup(bucketId), serializerInstance, bufferSize, - writeMetrics) - } - } else { - Array.tabulate[DiskBlockObjectWriter](numBuckets) { bucketId => + val writers: Array[DiskBlockObjectWriter] = { + Array.tabulate[DiskBlockObjectWriter](numReducers) { bucketId => val blockId = ShuffleBlockId(shuffleId, mapId, bucketId) val blockFile = blockManager.diskBlockManager.getFile(blockId) // Because of previous failures, the shuffle file may already exist on this machine. 
@@ -142,58 +102,14 @@ private[spark] class FileShuffleBlockResolver(conf: SparkConf) writeMetrics.incShuffleWriteTime(System.nanoTime - openStartTime) override def releaseWriters(success: Boolean) { - if (consolidateShuffleFiles) { - if (success) { - val offsets = writers.map(_.fileSegment().offset) - val lengths = writers.map(_.fileSegment().length) - fileGroup.recordMapOutput(mapId, offsets, lengths) - } - recycleFileGroup(fileGroup) - } else { - shuffleState.completedMapTasks.add(mapId) - } - } - - private def getUnusedFileGroup(): ShuffleFileGroup = { - val fileGroup = shuffleState.unusedFileGroups.poll() - if (fileGroup != null) fileGroup else newFileGroup() - } - - private def newFileGroup(): ShuffleFileGroup = { - val fileId = shuffleState.nextFileId.getAndIncrement() - val files = Array.tabulate[File](numBuckets) { bucketId => - val filename = physicalFileName(shuffleId, bucketId, fileId) - blockManager.diskBlockManager.getFile(filename) - } - val fileGroup = new ShuffleFileGroup(shuffleId, fileId, files) - shuffleState.allFileGroups.add(fileGroup) - fileGroup - } - - private def recycleFileGroup(group: ShuffleFileGroup) { - shuffleState.unusedFileGroups.add(group) + shuffleState.completedMapTasks.add(mapId) } } } override def getBlockData(blockId: ShuffleBlockId): ManagedBuffer = { - if (consolidateShuffleFiles) { - // Search all file groups associated with this shuffle. - val shuffleState = shuffleStates(blockId.shuffleId) - val iter = shuffleState.allFileGroups.iterator - while (iter.hasNext) { - val segmentOpt = iter.next.getFileSegmentFor(blockId.mapId, blockId.reduceId) - if (segmentOpt.isDefined) { - val segment = segmentOpt.get - return new FileSegmentManagedBuffer( - transportConf, segment.file, segment.offset, segment.length) - } - } - throw new IllegalStateException("Failed to find shuffle block: " + blockId) - } else { - val file = blockManager.diskBlockManager.getFile(blockId) - new FileSegmentManagedBuffer(transportConf, file, 0, file.length) - } + val file = blockManager.diskBlockManager.getFile(blockId) + new FileSegmentManagedBuffer(transportConf, file, 0, file.length) } /** Remove all the blocks / files and metadata related to a particular shuffle. 
*/ @@ -209,17 +125,9 @@ private[spark] class FileShuffleBlockResolver(conf: SparkConf) private def removeShuffleBlocks(shuffleId: ShuffleId): Boolean = { shuffleStates.get(shuffleId) match { case Some(state) => - if (consolidateShuffleFiles) { - for (fileGroup <- state.allFileGroups.asScala; - file <- fileGroup.files) { - file.delete() - } - } else { - for (mapId <- state.completedMapTasks.asScala; - reduceId <- 0 until state.numBuckets) { - val blockId = new ShuffleBlockId(shuffleId, mapId, reduceId) - blockManager.diskBlockManager.getFile(blockId).delete() - } + for (mapId <- state.completedMapTasks.asScala; reduceId <- 0 until state.numReducers) { + val blockId = new ShuffleBlockId(shuffleId, mapId, reduceId) + blockManager.diskBlockManager.getFile(blockId).delete() } logInfo("Deleted all files for shuffle " + shuffleId) true @@ -229,10 +137,6 @@ private[spark] class FileShuffleBlockResolver(conf: SparkConf) } } - private def physicalFileName(shuffleId: Int, bucketId: Int, fileId: Int) = { - "merged_shuffle_%d_%d_%d".format(shuffleId, bucketId, fileId) - } - private def cleanup(cleanupTime: Long) { shuffleStates.clearOldValues(cleanupTime, (shuffleId, state) => removeShuffleBlocks(shuffleId)) } @@ -241,59 +145,3 @@ private[spark] class FileShuffleBlockResolver(conf: SparkConf) metadataCleaner.cancel() } } - -private[spark] object FileShuffleBlockResolver { - /** - * A group of shuffle files, one per reducer. - * A particular mapper will be assigned a single ShuffleFileGroup to write its output to. - */ - private class ShuffleFileGroup(val shuffleId: Int, val fileId: Int, val files: Array[File]) { - private var numBlocks: Int = 0 - - /** - * Stores the absolute index of each mapId in the files of this group. For instance, - * if mapId 5 is the first block in each file, mapIdToIndex(5) = 0. - */ - private val mapIdToIndex = new PrimitiveKeyOpenHashMap[Int, Int]() - - /** - * Stores consecutive offsets and lengths of blocks into each reducer file, ordered by - * position in the file. - * Note: mapIdToIndex(mapId) returns the index of the mapper into the vector for every - * reducer. - */ - private val blockOffsetsByReducer = Array.fill[PrimitiveVector[Long]](files.length) { - new PrimitiveVector[Long]() - } - private val blockLengthsByReducer = Array.fill[PrimitiveVector[Long]](files.length) { - new PrimitiveVector[Long]() - } - - def apply(bucketId: Int): File = files(bucketId) - - def recordMapOutput(mapId: Int, offsets: Array[Long], lengths: Array[Long]) { - assert(offsets.length == lengths.length) - mapIdToIndex(mapId) = numBlocks - numBlocks += 1 - for (i <- 0 until offsets.length) { - blockOffsetsByReducer(i) += offsets(i) - blockLengthsByReducer(i) += lengths(i) - } - } - - /** Returns the FileSegment associated with the given map task, or None if no entry exists. 
*/ - def getFileSegmentFor(mapId: Int, reducerId: Int): Option[FileSegment] = { - val file = files(reducerId) - val blockOffsets = blockOffsetsByReducer(reducerId) - val blockLengths = blockLengthsByReducer(reducerId) - val index = mapIdToIndex.getOrElse(mapId, -1) - if (index >= 0) { - val offset = blockOffsets(index) - val length = blockLengths(index) - Some(new FileSegment(file, offset, length)) - } else { - None - } - } - } -} diff --git a/core/src/main/scala/org/apache/spark/storage/BlockManager.scala b/core/src/main/scala/org/apache/spark/storage/BlockManager.scala index d31aa68eb6954..bca3942f8c555 100644 --- a/core/src/main/scala/org/apache/spark/storage/BlockManager.scala +++ b/core/src/main/scala/org/apache/spark/storage/BlockManager.scala @@ -106,15 +106,6 @@ private[spark] class BlockManager( } } - // Check that we're not using external shuffle service with consolidated shuffle files. - if (externalShuffleServiceEnabled - && conf.getBoolean("spark.shuffle.consolidateFiles", false) - && shuffleManager.isInstanceOf[HashShuffleManager]) { - throw new UnsupportedOperationException("Cannot use external shuffle service with consolidated" - + " shuffle files in hash-based shuffle. Please disable spark.shuffle.consolidateFiles or " - + " switch to sort-based shuffle.") - } - var blockManagerId: BlockManagerId = _ // Address of the server that serves this executor's shuffle files. This is either an external diff --git a/core/src/main/scala/org/apache/spark/storage/DiskStore.scala b/core/src/main/scala/org/apache/spark/storage/DiskStore.scala index 1f45956282166..feb9533604ffb 100644 --- a/core/src/main/scala/org/apache/spark/storage/DiskStore.scala +++ b/core/src/main/scala/org/apache/spark/storage/DiskStore.scala @@ -154,9 +154,6 @@ private[spark] class DiskStore(blockManager: BlockManager, diskManager: DiskBloc override def remove(blockId: BlockId): Boolean = { val file = diskManager.getFile(blockId.name) - // If consolidation mode is used With HashShuffleMananger, the physical filename for the block - // is different from blockId.name. So the file returns here will not be exist, thus we avoid to - // delete the whole consolidated file by mistake. if (file.exists()) { file.delete() } else { diff --git a/core/src/test/scala/org/apache/spark/shuffle/hash/HashShuffleManagerSuite.scala b/core/src/test/scala/org/apache/spark/shuffle/hash/HashShuffleManagerSuite.scala deleted file mode 100644 index 491dc3659e184..0000000000000 --- a/core/src/test/scala/org/apache/spark/shuffle/hash/HashShuffleManagerSuite.scala +++ /dev/null @@ -1,110 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -package org.apache.spark.shuffle.hash - -import java.io.{File, FileWriter} - -import scala.language.reflectiveCalls - -import org.apache.spark.{LocalSparkContext, SparkConf, SparkContext, SparkEnv, SparkFunSuite} -import org.apache.spark.executor.ShuffleWriteMetrics -import org.apache.spark.network.buffer.{FileSegmentManagedBuffer, ManagedBuffer} -import org.apache.spark.serializer.JavaSerializer -import org.apache.spark.shuffle.FileShuffleBlockResolver -import org.apache.spark.storage.{ShuffleBlockId, FileSegment} - -class HashShuffleManagerSuite extends SparkFunSuite with LocalSparkContext { - private val testConf = new SparkConf(false) - - private def checkSegments(expected: FileSegment, buffer: ManagedBuffer) { - assert(buffer.isInstanceOf[FileSegmentManagedBuffer]) - val segment = buffer.asInstanceOf[FileSegmentManagedBuffer] - assert(expected.file.getCanonicalPath === segment.getFile.getCanonicalPath) - assert(expected.offset === segment.getOffset) - assert(expected.length === segment.getLength) - } - - test("consolidated shuffle can write to shuffle group without messing existing offsets/lengths") { - - val conf = new SparkConf(false) - // reset after EACH object write. This is to ensure that there are bytes appended after - // an object is written. So if the codepaths assume writeObject is end of data, this should - // flush those bugs out. This was common bug in ExternalAppendOnlyMap, etc. - conf.set("spark.serializer.objectStreamReset", "1") - conf.set("spark.serializer", "org.apache.spark.serializer.JavaSerializer") - conf.set("spark.shuffle.manager", "org.apache.spark.shuffle.hash.HashShuffleManager") - - sc = new SparkContext("local", "test", conf) - - val shuffleBlockResolver = - SparkEnv.get.shuffleManager.shuffleBlockResolver.asInstanceOf[FileShuffleBlockResolver] - - val shuffle1 = shuffleBlockResolver.forMapTask(1, 1, 1, new JavaSerializer(conf), - new ShuffleWriteMetrics) - for (writer <- shuffle1.writers) { - writer.write("test1", "value") - writer.write("test2", "value") - } - for (writer <- shuffle1.writers) { - writer.commitAndClose() - } - - val shuffle1Segment = shuffle1.writers(0).fileSegment() - shuffle1.releaseWriters(success = true) - - val shuffle2 = shuffleBlockResolver.forMapTask(1, 2, 1, new JavaSerializer(conf), - new ShuffleWriteMetrics) - - for (writer <- shuffle2.writers) { - writer.write("test3", "value") - writer.write("test4", "vlue") - } - for (writer <- shuffle2.writers) { - writer.commitAndClose() - } - val shuffle2Segment = shuffle2.writers(0).fileSegment() - shuffle2.releaseWriters(success = true) - - // Now comes the test : - // Write to shuffle 3; and close it, but before registering it, check if the file lengths for - // previous task (forof shuffle1) is the same as 'segments'. Earlier, we were inferring length - // of block based on remaining data in file : which could mess things up when there is - // concurrent read and writes happening to the same shuffle group. - - val shuffle3 = shuffleBlockResolver.forMapTask(1, 3, 1, new JavaSerializer(testConf), - new ShuffleWriteMetrics) - for (writer <- shuffle3.writers) { - writer.write("test3", "value") - writer.write("test4", "value") - } - for (writer <- shuffle3.writers) { - writer.commitAndClose() - } - // check before we register. 
- checkSegments(shuffle2Segment, shuffleBlockResolver.getBlockData(ShuffleBlockId(1, 2, 0))) - shuffle3.releaseWriters(success = true) - checkSegments(shuffle2Segment, shuffleBlockResolver.getBlockData(ShuffleBlockId(1, 2, 0))) - shuffleBlockResolver.removeShuffle(1) - } - - def writeToFile(file: File, numBytes: Int) { - val writer = new FileWriter(file, true) - for (i <- 0 until numBytes) writer.write(i) - writer.close() - } -} diff --git a/docs/configuration.md b/docs/configuration.md index 1a701f18881fe..3700051efb448 100644 --- a/docs/configuration.md +++ b/docs/configuration.md @@ -390,16 +390,6 @@ Apart from these, the following properties are also available, and may be useful spark.io.compression.codec. - - spark.shuffle.consolidateFiles - false - - If set to "true", consolidates intermediate files created during a shuffle. Creating fewer - files can improve filesystem performance for shuffles with large numbers of reduce tasks. It - is recommended to set this to "true" when using ext4 or xfs filesystems. On ext3, this option - might degrade performance on machines with many (>8) cores due to filesystem limitations. - - spark.shuffle.file.buffer 32k diff --git a/network/shuffle/src/main/java/org/apache/spark/network/shuffle/ExternalShuffleBlockResolver.java b/network/shuffle/src/main/java/org/apache/spark/network/shuffle/ExternalShuffleBlockResolver.java index 79beec4429a99..c5f93bb47f55c 100644 --- a/network/shuffle/src/main/java/org/apache/spark/network/shuffle/ExternalShuffleBlockResolver.java +++ b/network/shuffle/src/main/java/org/apache/spark/network/shuffle/ExternalShuffleBlockResolver.java @@ -50,9 +50,6 @@ * of Executors. Each Executor must register its own configuration about where it stores its files * (local dirs) and how (shuffle manager). The logic for retrieval of individual files is replicated * from Spark's FileShuffleBlockResolver and IndexShuffleBlockResolver. - * - * Executors with shuffle file consolidation are not currently supported, as the index is stored in - * the Executor's memory, unlike the IndexShuffleBlockResolver. */ public class ExternalShuffleBlockResolver { private static final Logger logger = LoggerFactory.getLogger(ExternalShuffleBlockResolver.class); @@ -254,7 +251,6 @@ private void deleteExecutorDirs(String[] dirs) { * Hash-based shuffle data is simply stored as one file per block. * This logic is from FileShuffleBlockResolver. 
*/ - // TODO: Support consolidated hash shuffle files private ManagedBuffer getHashBasedShuffleBlockData(ExecutorShuffleInfo executor, String blockId) { File shuffleFile = getFile(executor.localDirs, executor.subDirsPerLocalDir, blockId); return new FileSegmentManagedBuffer(conf, shuffleFile, 0, shuffleFile.length()); diff --git a/project/MimaExcludes.scala b/project/MimaExcludes.scala index 1c96b0958586f..814a11e588ceb 100644 --- a/project/MimaExcludes.scala +++ b/project/MimaExcludes.scala @@ -70,6 +70,10 @@ object MimaExcludes { "org.apache.spark.scheduler.AskPermissionToCommitOutput.this"), ProblemFilters.exclude[IncompatibleMethTypeProblem]( "org.apache.spark.scheduler.AskPermissionToCommitOutput.apply") + ) ++ + Seq( + ProblemFilters.exclude[MissingClassProblem]( + "org.apache.spark.shuffle.FileShuffleBlockResolver$ShuffleFileGroup") ) case v if v.startsWith("1.5") => Seq( From 8074208fa47fa654c1055c48cfa0d923edeeb04f Mon Sep 17 00:00:00 2001 From: Mingyu Kim Date: Fri, 18 Sep 2015 15:40:58 -0700 Subject: [PATCH 347/802] [SPARK-10611] Clone Configuration for each task for NewHadoopRDD This patch attempts to fix the Hadoop Configuration thread safety issue for NewHadoopRDD in the same way SPARK-2546 fixed the issue for HadoopRDD. Author: Mingyu Kim Closes #8763 from mingyukim/mkim/SPARK-10611. --- .../org/apache/spark/rdd/BinaryFileRDD.scala | 5 ++- .../org/apache/spark/rdd/NewHadoopRDD.scala | 37 ++++++++++++++++--- 2 files changed, 34 insertions(+), 8 deletions(-) diff --git a/core/src/main/scala/org/apache/spark/rdd/BinaryFileRDD.scala b/core/src/main/scala/org/apache/spark/rdd/BinaryFileRDD.scala index 6fec00dcd0d85..aedced7408cde 100644 --- a/core/src/main/scala/org/apache/spark/rdd/BinaryFileRDD.scala +++ b/core/src/main/scala/org/apache/spark/rdd/BinaryFileRDD.scala @@ -34,12 +34,13 @@ private[spark] class BinaryFileRDD[T]( override def getPartitions: Array[Partition] = { val inputFormat = inputFormatClass.newInstance + val conf = getConf inputFormat match { case configurable: Configurable => - configurable.setConf(getConf) + configurable.setConf(conf) case _ => } - val jobContext = newJobContext(getConf, jobId) + val jobContext = newJobContext(conf, jobId) inputFormat.setMinPartitions(jobContext, minPartitions) val rawSplits = inputFormat.getSplits(jobContext).toArray val result = new Array[Partition](rawSplits.size) diff --git a/core/src/main/scala/org/apache/spark/rdd/NewHadoopRDD.scala b/core/src/main/scala/org/apache/spark/rdd/NewHadoopRDD.scala index 174979aaeb231..2872b93b8730e 100644 --- a/core/src/main/scala/org/apache/spark/rdd/NewHadoopRDD.scala +++ b/core/src/main/scala/org/apache/spark/rdd/NewHadoopRDD.scala @@ -44,7 +44,6 @@ private[spark] class NewHadoopPartition( extends Partition { val serializableHadoopSplit = new SerializableWritable(rawSplit) - override def hashCode(): Int = 41 * (41 + rddId) + index } @@ -84,6 +83,27 @@ class NewHadoopRDD[K, V]( @transient protected val jobId = new JobID(jobTrackerId, id) + private val shouldCloneJobConf = sparkContext.conf.getBoolean("spark.hadoop.cloneConf", false) + + def getConf: Configuration = { + val conf: Configuration = confBroadcast.value.value + if (shouldCloneJobConf) { + // Hadoop Configuration objects are not thread-safe, which may lead to various problems if + // one job modifies a configuration while another reads it (SPARK-2546, SPARK-10611). This + // problem occurs somewhat rarely because most jobs treat the configuration as though it's + // immutable. 
One solution, implemented here, is to clone the Configuration object. + // Unfortunately, this clone can be very expensive. To avoid unexpected performance + // regressions for workloads and Hadoop versions that do not suffer from these thread-safety + // issues, this cloning is disabled by default. + NewHadoopRDD.CONFIGURATION_INSTANTIATION_LOCK.synchronized { + logDebug("Cloning Hadoop Configuration") + new Configuration(conf) + } + } else { + conf + } + } + override def getPartitions: Array[Partition] = { val inputFormat = inputFormatClass.newInstance inputFormat match { @@ -104,7 +124,7 @@ class NewHadoopRDD[K, V]( val iter = new Iterator[(K, V)] { val split = theSplit.asInstanceOf[NewHadoopPartition] logInfo("Input split: " + split.serializableHadoopSplit) - val conf = confBroadcast.value.value + val conf = getConf val inputMetrics = context.taskMetrics .getInputMetricsForReadMethod(DataReadMethod.Hadoop) @@ -230,11 +250,15 @@ class NewHadoopRDD[K, V]( super.persist(storageLevel) } - - def getConf: Configuration = confBroadcast.value.value } private[spark] object NewHadoopRDD { + /** + * Configuration's constructor is not threadsafe (see SPARK-1097 and HADOOP-10456). + * Therefore, we synchronize on this lock before calling new Configuration(). + */ + val CONFIGURATION_INSTANTIATION_LOCK = new Object() + /** * Analogous to [[org.apache.spark.rdd.MapPartitionsRDD]], but passes in an InputSplit to * the given function rather than the index of the partition. @@ -268,12 +292,13 @@ private[spark] class WholeTextFileRDD( override def getPartitions: Array[Partition] = { val inputFormat = inputFormatClass.newInstance + val conf = getConf inputFormat match { case configurable: Configurable => - configurable.setConf(getConf) + configurable.setConf(conf) case _ => } - val jobContext = newJobContext(getConf, jobId) + val jobContext = newJobContext(conf, jobId) inputFormat.setMinPartitions(jobContext, minPartitions) val rawSplits = inputFormat.getSplits(jobContext).toArray val result = new Array[Partition](rawSplits.size) From c8149ef2c57f5c47ab97ee8d8d58a216d4bc4156 Mon Sep 17 00:00:00 2001 From: Eric Liang Date: Fri, 18 Sep 2015 16:23:05 -0700 Subject: [PATCH 348/802] [MINOR] [ML] override toString of AttributeGroup This makes equality test failures much more readable. mengxr Author: Eric Liang Author: Eric Liang Closes #8826 from ericl/attrgroupstr. --- .../scala/org/apache/spark/ml/attribute/AttributeGroup.scala | 2 ++ 1 file changed, 2 insertions(+) diff --git a/mllib/src/main/scala/org/apache/spark/ml/attribute/AttributeGroup.scala b/mllib/src/main/scala/org/apache/spark/ml/attribute/AttributeGroup.scala index 457c15830fd38..2c29eeb01a921 100644 --- a/mllib/src/main/scala/org/apache/spark/ml/attribute/AttributeGroup.scala +++ b/mllib/src/main/scala/org/apache/spark/ml/attribute/AttributeGroup.scala @@ -183,6 +183,8 @@ class AttributeGroup private ( sum = 37 * sum + attributes.map(_.toSeq).hashCode sum } + + override def toString: String = toMetadata.toString } /** From 22be2ae147a111e88896f6fb42ed46bbf108a99b Mon Sep 17 00:00:00 2001 From: Cheng Lian Date: Fri, 18 Sep 2015 18:42:20 -0700 Subject: [PATCH 349/802] [SPARK-10623] [SQL] Fixes ORC predicate push-down When pushing down a leaf predicate, ORC `SearchArgument` builder requires an extra "parent" predicate (any one among `AND`/`OR`/`NOT`) to wrap the leaf predicate. E.g., to push down `a < 1`, we must build `AND(a < 1)` instead. 
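As an illustrative sketch only (not part of the patch; it assumes the Hive 1.2 `SearchArgumentFactory` builder API that `OrcFilters` already uses, with the column and literal from the example above):

```scala
import org.apache.hadoop.hive.ql.io.sarg.SearchArgumentFactory

// Build AND(a < 1) rather than the bare leaf a < 1: the leaf is wrapped in a
// logically redundant AND so that the builder accepts it.
val builder = SearchArgumentFactory.newBuilder()
val sarg = builder.startAnd().lessThan("a", 1L).end().build()
```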
Fortunately, when actually constructing the `SearchArgument`, the builder will eliminate all those unnecessary wrappers. This PR is based on #8783 authored by zhzhan. I also took the chance to simply `OrcFilters` a little bit to improve readability. Author: Cheng Lian Closes #8799 from liancheng/spark-10623/fix-orc-ppd. --- .../spark/sql/hive/orc/OrcFilters.scala | 56 ++++++++----------- .../spark/sql/hive/orc/OrcQuerySuite.scala | 30 ++++++++++ 2 files changed, 52 insertions(+), 34 deletions(-) diff --git a/sql/hive/src/main/scala/org/apache/spark/sql/hive/orc/OrcFilters.scala b/sql/hive/src/main/scala/org/apache/spark/sql/hive/orc/OrcFilters.scala index b3d9f7f71a27d..27193f54d3a91 100644 --- a/sql/hive/src/main/scala/org/apache/spark/sql/hive/orc/OrcFilters.scala +++ b/sql/hive/src/main/scala/org/apache/spark/sql/hive/orc/OrcFilters.scala @@ -31,11 +31,13 @@ import org.apache.spark.sql.sources._ * and cannot be used anymore. */ private[orc] object OrcFilters extends Logging { - def createFilter(expr: Array[Filter]): Option[SearchArgument] = { - expr.reduceOption(And).flatMap { conjunction => - val builder = SearchArgumentFactory.newBuilder() - buildSearchArgument(conjunction, builder).map(_.build()) - } + def createFilter(filters: Array[Filter]): Option[SearchArgument] = { + for { + // Combines all filters with `And`s to produce a single conjunction predicate + conjunction <- filters.reduceOption(And) + // Then tries to build a single ORC `SearchArgument` for the conjunction predicate + builder <- buildSearchArgument(conjunction, SearchArgumentFactory.newBuilder()) + } yield builder.build() } private def buildSearchArgument(expression: Filter, builder: Builder): Option[Builder] = { @@ -102,46 +104,32 @@ private[orc] object OrcFilters extends Logging { negate <- buildSearchArgument(child, builder.startNot()) } yield negate.end() - case EqualTo(attribute, value) => - Option(value) - .filter(isSearchableLiteral) - .map(builder.equals(attribute, _)) + case EqualTo(attribute, value) if isSearchableLiteral(value) => + Some(builder.startAnd().equals(attribute, value).end()) - case EqualNullSafe(attribute, value) => - Option(value) - .filter(isSearchableLiteral) - .map(builder.nullSafeEquals(attribute, _)) + case EqualNullSafe(attribute, value) if isSearchableLiteral(value) => + Some(builder.startAnd().nullSafeEquals(attribute, value).end()) - case LessThan(attribute, value) => - Option(value) - .filter(isSearchableLiteral) - .map(builder.lessThan(attribute, _)) + case LessThan(attribute, value) if isSearchableLiteral(value) => + Some(builder.startAnd().lessThan(attribute, value).end()) - case LessThanOrEqual(attribute, value) => - Option(value) - .filter(isSearchableLiteral) - .map(builder.lessThanEquals(attribute, _)) + case LessThanOrEqual(attribute, value) if isSearchableLiteral(value) => + Some(builder.startAnd().lessThanEquals(attribute, value).end()) - case GreaterThan(attribute, value) => - Option(value) - .filter(isSearchableLiteral) - .map(builder.startNot().lessThanEquals(attribute, _).end()) + case GreaterThan(attribute, value) if isSearchableLiteral(value) => + Some(builder.startNot().lessThanEquals(attribute, value).end()) - case GreaterThanOrEqual(attribute, value) => - Option(value) - .filter(isSearchableLiteral) - .map(builder.startNot().lessThan(attribute, _).end()) + case GreaterThanOrEqual(attribute, value) if isSearchableLiteral(value) => + Some(builder.startNot().lessThan(attribute, value).end()) case IsNull(attribute) => - Some(builder.isNull(attribute)) + 
Some(builder.startAnd().isNull(attribute).end()) case IsNotNull(attribute) => Some(builder.startNot().isNull(attribute).end()) - case In(attribute, values) => - Option(values) - .filter(_.forall(isSearchableLiteral)) - .map(builder.in(attribute, _)) + case In(attribute, values) if values.forall(isSearchableLiteral) => + Some(builder.startAnd().in(attribute, values.map(_.asInstanceOf[AnyRef]): _*).end()) case _ => None } diff --git a/sql/hive/src/test/scala/org/apache/spark/sql/hive/orc/OrcQuerySuite.scala b/sql/hive/src/test/scala/org/apache/spark/sql/hive/orc/OrcQuerySuite.scala index 8bc33fcf5d906..5eb39b1129701 100644 --- a/sql/hive/src/test/scala/org/apache/spark/sql/hive/orc/OrcQuerySuite.scala +++ b/sql/hive/src/test/scala/org/apache/spark/sql/hive/orc/OrcQuerySuite.scala @@ -344,4 +344,34 @@ class OrcQuerySuite extends QueryTest with BeforeAndAfterAll with OrcTest { } } } + + test("SPARK-10623 Enable ORC PPD") { + withTempPath { dir => + withSQLConf(SQLConf.ORC_FILTER_PUSHDOWN_ENABLED.key -> "true") { + import testImplicits._ + + val path = dir.getCanonicalPath + sqlContext.range(10).coalesce(1).write.orc(path) + val df = sqlContext.read.orc(path) + + def checkPredicate(pred: Column, answer: Seq[Long]): Unit = { + checkAnswer(df.where(pred), answer.map(Row(_))) + } + + checkPredicate('id === 5, Seq(5L)) + checkPredicate('id <=> 5, Seq(5L)) + checkPredicate('id < 5, 0L to 4L) + checkPredicate('id <= 5, 0L to 5L) + checkPredicate('id > 5, 6L to 9L) + checkPredicate('id >= 5, 5L to 9L) + checkPredicate('id.isNull, Seq.empty[Long]) + checkPredicate('id.isNotNull, 0L to 9L) + checkPredicate('id.isin(1L, 3L, 5L), Seq(1L, 3L, 5L)) + checkPredicate('id > 0 && 'id < 3, 1L to 2L) + checkPredicate('id < 1 || 'id > 8, Seq(0L, 9L)) + checkPredicate(!('id > 3), 0L to 3L) + checkPredicate(!('id > 0 && 'id < 3), Seq(0L) ++ (3L to 9L)) + } + } + } } From 7ff8d68cc19299e16dedfd819b9e96480fa6cf44 Mon Sep 17 00:00:00 2001 From: Andrew Or Date: Fri, 18 Sep 2015 23:58:25 -0700 Subject: [PATCH 350/802] [SPARK-10474] [SQL] Aggregation fails to allocate memory for pointer array When `TungstenAggregation` hits memory pressure, it switches from hash-based to sort-based aggregation in-place. However, in the process we try to allocate the pointer array for writing to the new `UnsafeExternalSorter` *before* actually freeing the memory from the hash map. This lead to the following exception: ``` java.io.IOException: Could not acquire 65536 bytes of memory at org.apache.spark.util.collection.unsafe.sort.UnsafeExternalSorter.initializeForWriting(UnsafeExternalSorter.java:169) at org.apache.spark.util.collection.unsafe.sort.UnsafeExternalSorter.spill(UnsafeExternalSorter.java:220) at org.apache.spark.sql.execution.UnsafeKVExternalSorter.(UnsafeKVExternalSorter.java:126) at org.apache.spark.sql.execution.UnsafeFixedWidthAggregationMap.destructAndCreateExternalSorter(UnsafeFixedWidthAggregationMap.java:257) at org.apache.spark.sql.execution.aggregate.TungstenAggregationIterator.switchToSortBasedAggregation(TungstenAggregationIterator.scala:435) ``` Author: Andrew Or Closes #8827 from andrewor14/allocate-pointer-array. 
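In sketch form, the fix reorders the teardown so the pointer array is only allocated after the map's memory has been released (Scala-style pseudocode of the change to the Java constructor shown in the diff below; `sorter` and `map` are the names used there):

```scala
// Spill the in-memory records, but do NOT allocate the next pointer array yet.
sorter.spill(false /* initialize for writing */)
// Release the hash map's pages; the spill alone does not return this memory.
map.free()
// Now there is room to allocate the pointer array for subsequent writes.
sorter.initializeForWriting()
```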
--- .../unsafe/sort/UnsafeExternalSorter.java | 14 +++++- .../sql/execution/UnsafeKVExternalSorter.java | 8 ++- .../UnsafeFixedWidthAggregationMapSuite.scala | 49 ++++++++++++++++++- 3 files changed, 66 insertions(+), 5 deletions(-) diff --git a/core/src/main/java/org/apache/spark/util/collection/unsafe/sort/UnsafeExternalSorter.java b/core/src/main/java/org/apache/spark/util/collection/unsafe/sort/UnsafeExternalSorter.java index fc364e0a895b1..14b6aafdea7df 100644 --- a/core/src/main/java/org/apache/spark/util/collection/unsafe/sort/UnsafeExternalSorter.java +++ b/core/src/main/java/org/apache/spark/util/collection/unsafe/sort/UnsafeExternalSorter.java @@ -159,7 +159,7 @@ public BoxedUnit apply() { /** * Allocates new sort data structures. Called when creating the sorter and after each spill. */ - private void initializeForWriting() throws IOException { + public void initializeForWriting() throws IOException { this.writeMetrics = new ShuffleWriteMetrics(); final long pointerArrayMemory = UnsafeInMemorySorter.getMemoryRequirementsForPointerArray(initialSize); @@ -187,6 +187,14 @@ public void closeCurrentPage() { * Sort and spill the current records in response to memory pressure. */ public void spill() throws IOException { + spill(true); + } + + /** + * Sort and spill the current records in response to memory pressure. + * @param shouldInitializeForWriting whether to allocate memory for writing after the spill + */ + public void spill(boolean shouldInitializeForWriting) throws IOException { assert(inMemSorter != null); logger.info("Thread {} spilling sort data of {} to disk ({} {} so far)", Thread.currentThread().getId(), @@ -217,7 +225,9 @@ public void spill() throws IOException { // written to disk. This also counts the space needed to store the sorter's pointer array. taskContext.taskMetrics().incMemoryBytesSpilled(spillSize); - initializeForWriting(); + if (shouldInitializeForWriting) { + initializeForWriting(); + } } /** diff --git a/sql/core/src/main/java/org/apache/spark/sql/execution/UnsafeKVExternalSorter.java b/sql/core/src/main/java/org/apache/spark/sql/execution/UnsafeKVExternalSorter.java index 7db6b7ff50f22..b81f67a16b815 100644 --- a/sql/core/src/main/java/org/apache/spark/sql/execution/UnsafeKVExternalSorter.java +++ b/sql/core/src/main/java/org/apache/spark/sql/execution/UnsafeKVExternalSorter.java @@ -85,6 +85,7 @@ public UnsafeKVExternalSorter(StructType keySchema, StructType valueSchema, // We will use the number of elements in the map as the initialSize of the // UnsafeInMemorySorter. Because UnsafeInMemorySorter does not accept 0 as the initialSize, // we will use 1 as its initial size if the map is empty. + // TODO: track pointer array memory used by this in-memory sorter! final UnsafeInMemorySorter inMemSorter = new UnsafeInMemorySorter( taskMemoryManager, recordComparator, prefixComparator, Math.max(1, map.numElements())); @@ -123,8 +124,13 @@ public UnsafeKVExternalSorter(StructType keySchema, StructType valueSchema, pageSizeBytes, inMemSorter); - sorter.spill(); + // Note: This spill doesn't actually release any memory, so if we try to allocate a new + // pointer array immediately after the spill then we may fail to acquire sufficient space + // for it (SPARK-10474). For this reason, we must initialize for writing explicitly *after* + // we have actually freed memory from our map. 
+ sorter.spill(false /* initialize for writing */); map.free(); + sorter.initializeForWriting(); } } diff --git a/sql/core/src/test/scala/org/apache/spark/sql/execution/UnsafeFixedWidthAggregationMapSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/execution/UnsafeFixedWidthAggregationMapSuite.scala index d1f0b2b1fc52f..ada4d42f991ce 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/execution/UnsafeFixedWidthAggregationMapSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/execution/UnsafeFixedWidthAggregationMapSuite.scala @@ -23,9 +23,10 @@ import scala.util.{Try, Random} import org.scalatest.Matchers -import org.apache.spark.sql.catalyst.expressions.{UnsafeRow, UnsafeProjection} import org.apache.spark.{TaskContextImpl, TaskContext, SparkFunSuite} +import org.apache.spark.shuffle.ShuffleMemoryManager import org.apache.spark.sql.catalyst.InternalRow +import org.apache.spark.sql.catalyst.expressions.{UnsafeRow, UnsafeProjection} import org.apache.spark.sql.test.SharedSQLContext import org.apache.spark.sql.types._ import org.apache.spark.unsafe.memory.{ExecutorMemoryManager, MemoryAllocator, TaskMemoryManager} @@ -325,7 +326,7 @@ class UnsafeFixedWidthAggregationMapSuite // At here, we also test if copy is correct. iter.getKey.copy() iter.getValue.copy() - count += 1; + count += 1 } // 1 record was from the map and 4096 records were explicitly inserted. @@ -333,4 +334,48 @@ class UnsafeFixedWidthAggregationMapSuite map.free() } + + testWithMemoryLeakDetection("convert to external sorter under memory pressure (SPARK-10474)") { + val smm = ShuffleMemoryManager.createForTesting(65536) + val pageSize = 4096 + val map = new UnsafeFixedWidthAggregationMap( + emptyAggregationBuffer, + aggBufferSchema, + groupKeySchema, + taskMemoryManager, + smm, + 128, // initial capacity + pageSize, + false // disable perf metrics + ) + + // Insert into the map until we've run out of space + val rand = new Random(42) + var hasSpace = true + while (hasSpace) { + val str = rand.nextString(1024) + val buf = map.getAggregationBuffer(InternalRow(UTF8String.fromString(str))) + if (buf == null) { + hasSpace = false + } else { + buf.setInt(0, str.length) + } + } + + // Ensure we're actually maxed out by asserting that we can't acquire even just 1 byte + assert(smm.tryToAcquire(1) === 0) + + // Convert the map into a sorter. This used to fail before the fix for SPARK-10474 + // because we would try to acquire space for the in-memory sorter pointer array before + // actually releasing the pages despite having spilled all of them. + var sorter: UnsafeKVExternalSorter = null + try { + sorter = map.destructAndCreateExternalSorter() + } finally { + if (sorter != null) { + sorter.cleanupResources() + } + } + } + } From d507f9c0b7f7a524137a694ed6443747aaf90463 Mon Sep 17 00:00:00 2001 From: Kousuke Saruta Date: Sat, 19 Sep 2015 01:59:36 -0700 Subject: [PATCH 351/802] [SPARK-10584] [SQL] [DOC] Documentation about the compatible Hive version is wrong. In Spark 1.5.0, Spark SQL is compatible with Hive 0.12.0 through 1.2.1 but the documentation is wrong. /CC yhuai Author: Kousuke Saruta Closes #8776 from sarutak/SPARK-10584-2. --- docs/sql-programming-guide.md | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/docs/sql-programming-guide.md b/docs/sql-programming-guide.md index a0b911d207243..82d4243cc6b27 100644 --- a/docs/sql-programming-guide.md +++ b/docs/sql-programming-guide.md @@ -1954,7 +1954,7 @@ without the need to write any code. 
## Running the Thrift JDBC/ODBC server The Thrift JDBC/ODBC server implemented here corresponds to the [`HiveServer2`](https://cwiki.apache.org/confluence/display/Hive/Setting+Up+HiveServer2) -in Hive 0.13. You can test the JDBC server with the beeline script that comes with either Spark or Hive 0.13. +in Hive 1.2.1 You can test the JDBC server with the beeline script that comes with either Spark or Hive 1.2.1. To start the JDBC/ODBC server, run the following in the Spark directory: @@ -2260,8 +2260,10 @@ Several caching related features are not supported yet: ## Compatibility with Apache Hive -Spark SQL is designed to be compatible with the Hive Metastore, SerDes and UDFs. Currently Spark -SQL is based on Hive 0.12.0 and 0.13.1. +Spark SQL is designed to be compatible with the Hive Metastore, SerDes and UDFs. +Currently Hive SerDes and UDFs are based on Hive 1.2.1, +and Spark SQL can be connected to different versions of Hive Metastore +(from 0.12.0 to 1.2.1. Also see http://spark.apache.org/docs/latest/sql-programming-guide.html#interacting-with-different-versions-of-hive-metastore). #### Deploying in Existing Hive Warehouses From d83b6aae8b4357c56779cc98804eb350ab8af62d Mon Sep 17 00:00:00 2001 From: Alexis Seigneurin Date: Sat, 19 Sep 2015 12:01:22 +0100 Subject: [PATCH 352/802] Fixed links to the API Submitting this change on the master branch as requested in https://github.com/apache/spark/pull/8819#issuecomment-141505941 Author: Alexis Seigneurin Closes #8838 from aseigneurin/patch-2. --- docs/ml-guide.md | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/docs/ml-guide.md b/docs/ml-guide.md index c5d7f990021f1..0427ac6695aa1 100644 --- a/docs/ml-guide.md +++ b/docs/ml-guide.md @@ -619,13 +619,13 @@ for row in selected.collect(): An important task in ML is *model selection*, or using data to find the best model or parameters for a given task. This is also called *tuning*. `Pipeline`s facilitate model selection by making it easy to tune an entire `Pipeline` at once, rather than tuning each element in the `Pipeline` separately. -Currently, `spark.ml` supports model selection using the [`CrossValidator`](api/scala/index.html#org.apache.spark.ml.tuning.CrossValidator) class, which takes an `Estimator`, a set of `ParamMap`s, and an [`Evaluator`](api/scala/index.html#org.apache.spark.ml.Evaluator). +Currently, `spark.ml` supports model selection using the [`CrossValidator`](api/scala/index.html#org.apache.spark.ml.tuning.CrossValidator) class, which takes an `Estimator`, a set of `ParamMap`s, and an [`Evaluator`](api/scala/index.html#org.apache.spark.ml.evaluation.Evaluator). `CrossValidator` begins by splitting the dataset into a set of *folds* which are used as separate training and test datasets; e.g., with `$k=3$` folds, `CrossValidator` will generate 3 (training, test) dataset pairs, each of which uses 2/3 of the data for training and 1/3 for testing. `CrossValidator` iterates through the set of `ParamMap`s. For each `ParamMap`, it trains the given `Estimator` and evaluates it using the given `Evaluator`. 
-The `Evaluator` can be a [`RegressionEvaluator`](api/scala/index.html#org.apache.spark.ml.RegressionEvaluator) -for regression problems, a [`BinaryClassificationEvaluator`](api/scala/index.html#org.apache.spark.ml.BinaryClassificationEvaluator) -for binary data, or a [`MultiClassClassificationEvaluator`](api/scala/index.html#org.apache.spark.ml.MultiClassClassificationEvaluator) +The `Evaluator` can be a [`RegressionEvaluator`](api/scala/index.html#org.apache.spark.ml.evaluation.RegressionEvaluator) +for regression problems, a [`BinaryClassificationEvaluator`](api/scala/index.html#org.apache.spark.ml.evaluation.BinaryClassificationEvaluator) +for binary data, or a [`MultiClassClassificationEvaluator`](api/scala/index.html#org.apache.spark.ml.evaluation.MultiClassClassificationEvaluator) for multiclass problems. The default metric used to choose the best `ParamMap` can be overriden by the `setMetric` method in each of these evaluators. From e789000b88a6bd840f821c53f42c08b97dc02496 Mon Sep 17 00:00:00 2001 From: zsxwing Date: Sat, 19 Sep 2015 18:22:43 -0700 Subject: [PATCH 353/802] [SPARK-10155] [SQL] Change SqlParser to object to avoid memory leak Since `scala.util.parsing.combinator.Parsers` is thread-safe since Scala 2.10 (See [SI-4929](https://issues.scala-lang.org/browse/SI-4929)), we can change SqlParser to object to avoid memory leak. I didn't change other subclasses of `scala.util.parsing.combinator.Parsers` because there is only one instance in one SQLContext, which should not be an issue. Author: zsxwing Closes #8357 from zsxwing/sql-memory-leak. --- .../apache/spark/sql/catalyst/AbstractSparkSQLParser.scala | 2 +- .../scala/org/apache/spark/sql/catalyst/ParserDialect.scala | 2 +- .../scala/org/apache/spark/sql/catalyst/SqlParser.scala | 6 +++--- .../src/main/scala/org/apache/spark/sql/DataFrame.scala | 6 +++--- .../main/scala/org/apache/spark/sql/DataFrameWriter.scala | 4 ++-- .../src/main/scala/org/apache/spark/sql/SQLContext.scala | 6 +++--- .../src/main/scala/org/apache/spark/sql/functions.scala | 2 +- .../main/scala/org/apache/spark/sql/hive/HiveContext.scala | 6 +++--- .../org/apache/spark/sql/hive/HiveMetastoreCatalog.scala | 4 ++-- 9 files changed, 19 insertions(+), 19 deletions(-) diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/AbstractSparkSQLParser.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/AbstractSparkSQLParser.scala index 5898a5f93f381..2bac08eac4fe2 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/AbstractSparkSQLParser.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/AbstractSparkSQLParser.scala @@ -28,7 +28,7 @@ import org.apache.spark.sql.catalyst.plans.logical._ private[sql] abstract class AbstractSparkSQLParser extends StandardTokenParsers with PackratParsers { - def parse(input: String): LogicalPlan = { + def parse(input: String): LogicalPlan = synchronized { // Initialize the Keywords. 
initLexical phrase(start)(new lexical.Scanner(input)) match { diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/ParserDialect.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/ParserDialect.scala index 554fb4eb25eb1..e21d3c05464b6 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/ParserDialect.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/ParserDialect.scala @@ -61,7 +61,7 @@ abstract class ParserDialect { */ private[spark] class DefaultParserDialect extends ParserDialect { @transient - protected val sqlParser = new SqlParser + protected val sqlParser = SqlParser override def parse(sqlText: String): LogicalPlan = { sqlParser.parse(sqlText) diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/SqlParser.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/SqlParser.scala index f2498861c9573..dfab2398857e8 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/SqlParser.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/SqlParser.scala @@ -37,9 +37,9 @@ import org.apache.spark.unsafe.types.CalendarInterval * This is currently included mostly for illustrative purposes. Users wanting more complete support * for a SQL like language should checkout the HiveQL support in the sql/hive sub-project. */ -class SqlParser extends AbstractSparkSQLParser with DataTypeParser { +object SqlParser extends AbstractSparkSQLParser with DataTypeParser { - def parseExpression(input: String): Expression = { + def parseExpression(input: String): Expression = synchronized { // Initialize the Keywords. initLexical phrase(projection)(new lexical.Scanner(input)) match { @@ -48,7 +48,7 @@ class SqlParser extends AbstractSparkSQLParser with DataTypeParser { } } - def parseTableIdentifier(input: String): TableIdentifier = { + def parseTableIdentifier(input: String): TableIdentifier = synchronized { // Initialize the Keywords. 
initLexical phrase(tableIdentifier)(new lexical.Scanner(input)) match { diff --git a/sql/core/src/main/scala/org/apache/spark/sql/DataFrame.scala b/sql/core/src/main/scala/org/apache/spark/sql/DataFrame.scala index 3e61123c145cd..8f737c2023931 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/DataFrame.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/DataFrame.scala @@ -720,7 +720,7 @@ class DataFrame private[sql]( @scala.annotation.varargs def selectExpr(exprs: String*): DataFrame = { select(exprs.map { expr => - Column(new SqlParser().parseExpression(expr)) + Column(SqlParser.parseExpression(expr)) }: _*) } @@ -745,7 +745,7 @@ class DataFrame private[sql]( * @since 1.3.0 */ def filter(conditionExpr: String): DataFrame = { - filter(Column(new SqlParser().parseExpression(conditionExpr))) + filter(Column(SqlParser.parseExpression(conditionExpr))) } /** @@ -769,7 +769,7 @@ class DataFrame private[sql]( * @since 1.5.0 */ def where(conditionExpr: String): DataFrame = { - filter(Column(new SqlParser().parseExpression(conditionExpr))) + filter(Column(SqlParser.parseExpression(conditionExpr))) } /** diff --git a/sql/core/src/main/scala/org/apache/spark/sql/DataFrameWriter.scala b/sql/core/src/main/scala/org/apache/spark/sql/DataFrameWriter.scala index 745bb4ec9cf1c..03e973666e888 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/DataFrameWriter.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/DataFrameWriter.scala @@ -163,7 +163,7 @@ final class DataFrameWriter private[sql](df: DataFrame) { * @since 1.4.0 */ def insertInto(tableName: String): Unit = { - insertInto(new SqlParser().parseTableIdentifier(tableName)) + insertInto(SqlParser.parseTableIdentifier(tableName)) } private def insertInto(tableIdent: TableIdentifier): Unit = { @@ -197,7 +197,7 @@ final class DataFrameWriter private[sql](df: DataFrame) { * @since 1.4.0 */ def saveAsTable(tableName: String): Unit = { - saveAsTable(new SqlParser().parseTableIdentifier(tableName)) + saveAsTable(SqlParser.parseTableIdentifier(tableName)) } private def saveAsTable(tableIdent: TableIdentifier): Unit = { diff --git a/sql/core/src/main/scala/org/apache/spark/sql/SQLContext.scala b/sql/core/src/main/scala/org/apache/spark/sql/SQLContext.scala index e3fdd782e6ff6..f099940800cc0 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/SQLContext.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/SQLContext.scala @@ -590,7 +590,7 @@ class SQLContext(@transient val sparkContext: SparkContext) tableName: String, source: String, options: Map[String, String]): DataFrame = { - val tableIdent = new SqlParser().parseTableIdentifier(tableName) + val tableIdent = SqlParser.parseTableIdentifier(tableName) val cmd = CreateTableUsing( tableIdent, @@ -636,7 +636,7 @@ class SQLContext(@transient val sparkContext: SparkContext) source: String, schema: StructType, options: Map[String, String]): DataFrame = { - val tableIdent = new SqlParser().parseTableIdentifier(tableName) + val tableIdent = SqlParser.parseTableIdentifier(tableName) val cmd = CreateTableUsing( tableIdent, @@ -732,7 +732,7 @@ class SQLContext(@transient val sparkContext: SparkContext) * @since 1.3.0 */ def table(tableName: String): DataFrame = { - table(new SqlParser().parseTableIdentifier(tableName)) + table(SqlParser.parseTableIdentifier(tableName)) } private def table(tableIdent: TableIdentifier): DataFrame = { diff --git a/sql/core/src/main/scala/org/apache/spark/sql/functions.scala b/sql/core/src/main/scala/org/apache/spark/sql/functions.scala index 
60d9c509104d5..2467b4e48415b 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/functions.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/functions.scala @@ -823,7 +823,7 @@ object functions { * * @group normal_funcs */ - def expr(expr: String): Column = Column(new SqlParser().parseExpression(expr)) + def expr(expr: String): Column = Column(SqlParser.parseExpression(expr)) ////////////////////////////////////////////////////////////////////////////////////////////// // Math Functions diff --git a/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveContext.scala b/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveContext.scala index d37ba5ddc2d80..c12a734863326 100644 --- a/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveContext.scala +++ b/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveContext.scala @@ -291,12 +291,12 @@ class HiveContext(sc: SparkContext) extends SQLContext(sc) with Logging { * @since 1.3.0 */ def refreshTable(tableName: String): Unit = { - val tableIdent = new SqlParser().parseTableIdentifier(tableName) + val tableIdent = SqlParser.parseTableIdentifier(tableName) catalog.refreshTable(tableIdent) } protected[hive] def invalidateTable(tableName: String): Unit = { - val tableIdent = new SqlParser().parseTableIdentifier(tableName) + val tableIdent = SqlParser.parseTableIdentifier(tableName) catalog.invalidateTable(tableIdent) } @@ -311,7 +311,7 @@ class HiveContext(sc: SparkContext) extends SQLContext(sc) with Logging { */ @Experimental def analyze(tableName: String) { - val tableIdent = new SqlParser().parseTableIdentifier(tableName) + val tableIdent = SqlParser.parseTableIdentifier(tableName) val relation = EliminateSubQueries(catalog.lookupRelation(tableIdent.toSeq)) relation match { diff --git a/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveMetastoreCatalog.scala b/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveMetastoreCatalog.scala index 0a5569b0a4446..0c1b41e3377e3 100644 --- a/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveMetastoreCatalog.scala +++ b/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveMetastoreCatalog.scala @@ -199,7 +199,7 @@ private[hive] class HiveMetastoreCatalog(val client: ClientInterface, hive: Hive options: Map[String, String], isExternal: Boolean): Unit = { createDataSourceTable( - new SqlParser().parseTableIdentifier(tableName), + SqlParser.parseTableIdentifier(tableName), userSpecifiedSchema, partitionColumns, provider, @@ -375,7 +375,7 @@ private[hive] class HiveMetastoreCatalog(val client: ClientInterface, hive: Hive } def hiveDefaultTableFilePath(tableName: String): String = { - hiveDefaultTableFilePath(new SqlParser().parseTableIdentifier(tableName)) + hiveDefaultTableFilePath(SqlParser.parseTableIdentifier(tableName)) } def hiveDefaultTableFilePath(tableIdent: TableIdentifier): String = { From 2117eea71ece825fbc3797c8b38184ae221f5223 Mon Sep 17 00:00:00 2001 From: Josh Rosen Date: Sat, 19 Sep 2015 21:40:21 -0700 Subject: [PATCH 354/802] [SPARK-10710] Remove ability to disable spilling in core and SQL It does not make much sense to set `spark.shuffle.spill` or `spark.sql.planner.externalSort` to false: I believe that these configurations were initially added as "escape hatches" to guard against bugs in the external operators, but these operators are now mature and well-tested. In addition, these configurations are not handled in a consistent way anymore: SQL's Tungsten codepath ignores these configurations and will continue to use spilling operators. 
Similarly, Spark Core's `tungsten-sort` shuffle manager does not respect `spark.shuffle.spill=false`. This pull request removes these configurations, adds warnings at the appropriate places, and deletes a large amount of code which was only used in code paths that did not support spilling. Author: Josh Rosen Closes #8831 from JoshRosen/remove-ability-to-disable-spilling. --- .../scala/org/apache/spark/Aggregator.scala | 59 +++++-------------- .../org/apache/spark/rdd/CoGroupedRDD.scala | 40 ++++--------- .../shuffle/hash/HashShuffleManager.scala | 8 ++- .../shuffle/sort/SortShuffleManager.scala | 10 +++- .../util/collection/ExternalSorter.scala | 6 -- .../spark/deploy/SparkSubmitSuite.scala | 22 +++---- docs/configuration.md | 14 +---- docs/sql-programming-guide.md | 7 --- python/pyspark/rdd.py | 25 +++----- python/pyspark/shuffle.py | 30 ---------- python/pyspark/tests.py | 13 +--- .../scala/org/apache/spark/sql/SQLConf.scala | 8 +-- .../spark/sql/execution/SparkStrategies.scala | 2 - .../apache/spark/sql/execution/commands.scala | 9 +++ .../org/apache/spark/sql/execution/sort.scala | 30 +--------- .../org/apache/spark/sql/SQLQuerySuite.scala | 26 ++------ .../execution/RowFormatConvertersSuite.scala | 2 +- .../spark/sql/execution/SortSuite.scala | 4 +- 18 files changed, 81 insertions(+), 234 deletions(-) diff --git a/core/src/main/scala/org/apache/spark/Aggregator.scala b/core/src/main/scala/org/apache/spark/Aggregator.scala index 289aab9bd9e51..7196e57d5d2e2 100644 --- a/core/src/main/scala/org/apache/spark/Aggregator.scala +++ b/core/src/main/scala/org/apache/spark/Aggregator.scala @@ -18,7 +18,7 @@ package org.apache.spark import org.apache.spark.annotation.DeveloperApi -import org.apache.spark.util.collection.{AppendOnlyMap, ExternalAppendOnlyMap} +import org.apache.spark.util.collection.ExternalAppendOnlyMap /** * :: DeveloperApi :: @@ -34,59 +34,30 @@ case class Aggregator[K, V, C] ( mergeValue: (C, V) => C, mergeCombiners: (C, C) => C) { - // When spilling is enabled sorting will happen externally, but not necessarily with an - // ExternalSorter. 
- private val isSpillEnabled = SparkEnv.get.conf.getBoolean("spark.shuffle.spill", true) - @deprecated("use combineValuesByKey with TaskContext argument", "0.9.0") def combineValuesByKey(iter: Iterator[_ <: Product2[K, V]]): Iterator[(K, C)] = combineValuesByKey(iter, null) - def combineValuesByKey(iter: Iterator[_ <: Product2[K, V]], - context: TaskContext): Iterator[(K, C)] = { - if (!isSpillEnabled) { - val combiners = new AppendOnlyMap[K, C] - var kv: Product2[K, V] = null - val update = (hadValue: Boolean, oldValue: C) => { - if (hadValue) mergeValue(oldValue, kv._2) else createCombiner(kv._2) - } - while (iter.hasNext) { - kv = iter.next() - combiners.changeValue(kv._1, update) - } - combiners.iterator - } else { - val combiners = new ExternalAppendOnlyMap[K, V, C](createCombiner, mergeValue, mergeCombiners) - combiners.insertAll(iter) - updateMetrics(context, combiners) - combiners.iterator - } + def combineValuesByKey( + iter: Iterator[_ <: Product2[K, V]], + context: TaskContext): Iterator[(K, C)] = { + val combiners = new ExternalAppendOnlyMap[K, V, C](createCombiner, mergeValue, mergeCombiners) + combiners.insertAll(iter) + updateMetrics(context, combiners) + combiners.iterator } @deprecated("use combineCombinersByKey with TaskContext argument", "0.9.0") def combineCombinersByKey(iter: Iterator[_ <: Product2[K, C]]) : Iterator[(K, C)] = combineCombinersByKey(iter, null) - def combineCombinersByKey(iter: Iterator[_ <: Product2[K, C]], context: TaskContext) - : Iterator[(K, C)] = - { - if (!isSpillEnabled) { - val combiners = new AppendOnlyMap[K, C] - var kc: Product2[K, C] = null - val update = (hadValue: Boolean, oldValue: C) => { - if (hadValue) mergeCombiners(oldValue, kc._2) else kc._2 - } - while (iter.hasNext) { - kc = iter.next() - combiners.changeValue(kc._1, update) - } - combiners.iterator - } else { - val combiners = new ExternalAppendOnlyMap[K, C, C](identity, mergeCombiners, mergeCombiners) - combiners.insertAll(iter) - updateMetrics(context, combiners) - combiners.iterator - } + def combineCombinersByKey( + iter: Iterator[_ <: Product2[K, C]], + context: TaskContext): Iterator[(K, C)] = { + val combiners = new ExternalAppendOnlyMap[K, C, C](identity, mergeCombiners, mergeCombiners) + combiners.insertAll(iter) + updateMetrics(context, combiners) + combiners.iterator } /** Update task metrics after populating the external map. 
*/ diff --git a/core/src/main/scala/org/apache/spark/rdd/CoGroupedRDD.scala b/core/src/main/scala/org/apache/spark/rdd/CoGroupedRDD.scala index 7bad749d58327..935c3babd8ea1 100644 --- a/core/src/main/scala/org/apache/spark/rdd/CoGroupedRDD.scala +++ b/core/src/main/scala/org/apache/spark/rdd/CoGroupedRDD.scala @@ -26,7 +26,7 @@ import scala.reflect.ClassTag import org.apache.spark._ import org.apache.spark.annotation.DeveloperApi -import org.apache.spark.util.collection.{ExternalAppendOnlyMap, AppendOnlyMap, CompactBuffer} +import org.apache.spark.util.collection.{CompactBuffer, ExternalAppendOnlyMap} import org.apache.spark.util.Utils import org.apache.spark.serializer.Serializer @@ -128,8 +128,6 @@ class CoGroupedRDD[K: ClassTag]( override val partitioner: Some[Partitioner] = Some(part) override def compute(s: Partition, context: TaskContext): Iterator[(K, Array[Iterable[_]])] = { - val sparkConf = SparkEnv.get.conf - val externalSorting = sparkConf.getBoolean("spark.shuffle.spill", true) val split = s.asInstanceOf[CoGroupPartition] val numRdds = dependencies.length @@ -150,34 +148,16 @@ class CoGroupedRDD[K: ClassTag]( rddIterators += ((it, depNum)) } - if (!externalSorting) { - val map = new AppendOnlyMap[K, CoGroupCombiner] - val update: (Boolean, CoGroupCombiner) => CoGroupCombiner = (hadVal, oldVal) => { - if (hadVal) oldVal else Array.fill(numRdds)(new CoGroup) - } - val getCombiner: K => CoGroupCombiner = key => { - map.changeValue(key, update) - } - rddIterators.foreach { case (it, depNum) => - while (it.hasNext) { - val kv = it.next() - getCombiner(kv._1)(depNum) += kv._2 - } - } - new InterruptibleIterator(context, - map.iterator.asInstanceOf[Iterator[(K, Array[Iterable[_]])]]) - } else { - val map = createExternalMap(numRdds) - for ((it, depNum) <- rddIterators) { - map.insertAll(it.map(pair => (pair._1, new CoGroupValue(pair._2, depNum)))) - } - context.taskMetrics().incMemoryBytesSpilled(map.memoryBytesSpilled) - context.taskMetrics().incDiskBytesSpilled(map.diskBytesSpilled) - context.internalMetricsToAccumulators( - InternalAccumulator.PEAK_EXECUTION_MEMORY).add(map.peakMemoryUsedBytes) - new InterruptibleIterator(context, - map.iterator.asInstanceOf[Iterator[(K, Array[Iterable[_]])]]) + val map = createExternalMap(numRdds) + for ((it, depNum) <- rddIterators) { + map.insertAll(it.map(pair => (pair._1, new CoGroupValue(pair._2, depNum)))) } + context.taskMetrics().incMemoryBytesSpilled(map.memoryBytesSpilled) + context.taskMetrics().incDiskBytesSpilled(map.diskBytesSpilled) + context.internalMetricsToAccumulators( + InternalAccumulator.PEAK_EXECUTION_MEMORY).add(map.peakMemoryUsedBytes) + new InterruptibleIterator(context, + map.iterator.asInstanceOf[Iterator[(K, Array[Iterable[_]])]]) } private def createExternalMap(numRdds: Int) diff --git a/core/src/main/scala/org/apache/spark/shuffle/hash/HashShuffleManager.scala b/core/src/main/scala/org/apache/spark/shuffle/hash/HashShuffleManager.scala index c089088f409dd..0b46634b8b466 100644 --- a/core/src/main/scala/org/apache/spark/shuffle/hash/HashShuffleManager.scala +++ b/core/src/main/scala/org/apache/spark/shuffle/hash/HashShuffleManager.scala @@ -24,7 +24,13 @@ import org.apache.spark.shuffle._ * A ShuffleManager using hashing, that creates one output file per reduce partition on each * mapper (possibly reusing these across waves of tasks). 
*/ -private[spark] class HashShuffleManager(conf: SparkConf) extends ShuffleManager { +private[spark] class HashShuffleManager(conf: SparkConf) extends ShuffleManager with Logging { + + if (!conf.getBoolean("spark.shuffle.spill", true)) { + logWarning( + "spark.shuffle.spill was set to false, but this configuration is ignored as of Spark 1.6+." + + " Shuffle will continue to spill to disk when necessary.") + } private val fileShuffleBlockResolver = new FileShuffleBlockResolver(conf) diff --git a/core/src/main/scala/org/apache/spark/shuffle/sort/SortShuffleManager.scala b/core/src/main/scala/org/apache/spark/shuffle/sort/SortShuffleManager.scala index d7fab351ca3b8..476cc1f303da7 100644 --- a/core/src/main/scala/org/apache/spark/shuffle/sort/SortShuffleManager.scala +++ b/core/src/main/scala/org/apache/spark/shuffle/sort/SortShuffleManager.scala @@ -19,11 +19,17 @@ package org.apache.spark.shuffle.sort import java.util.concurrent.ConcurrentHashMap -import org.apache.spark.{SparkConf, TaskContext, ShuffleDependency} +import org.apache.spark.{Logging, SparkConf, TaskContext, ShuffleDependency} import org.apache.spark.shuffle._ import org.apache.spark.shuffle.hash.HashShuffleReader -private[spark] class SortShuffleManager(conf: SparkConf) extends ShuffleManager { +private[spark] class SortShuffleManager(conf: SparkConf) extends ShuffleManager with Logging { + + if (!conf.getBoolean("spark.shuffle.spill", true)) { + logWarning( + "spark.shuffle.spill was set to false, but this configuration is ignored as of Spark 1.6+." + + " Shuffle will continue to spill to disk when necessary.") + } private val indexShuffleBlockResolver = new IndexShuffleBlockResolver(conf) private val shuffleMapNumber = new ConcurrentHashMap[Int, Int]() diff --git a/core/src/main/scala/org/apache/spark/util/collection/ExternalSorter.scala b/core/src/main/scala/org/apache/spark/util/collection/ExternalSorter.scala index 31230d5978b2a..2a30f751ff03d 100644 --- a/core/src/main/scala/org/apache/spark/util/collection/ExternalSorter.scala +++ b/core/src/main/scala/org/apache/spark/util/collection/ExternalSorter.scala @@ -116,8 +116,6 @@ private[spark] class ExternalSorter[K, V, C]( private val ser = Serializer.getSerializer(serializer) private val serInstance = ser.newInstance() - private val spillingEnabled = conf.getBoolean("spark.shuffle.spill", true) - // Use getSizeAsKb (not bytes) to maintain backwards compatibility if no units are provided private val fileBufferSize = conf.getSizeAsKb("spark.shuffle.file.buffer", "32k").toInt * 1024 @@ -229,10 +227,6 @@ private[spark] class ExternalSorter[K, V, C]( * @param usingMap whether we're using a map or buffer as our current in-memory collection */ private def maybeSpillCollection(usingMap: Boolean): Unit = { - if (!spillingEnabled) { - return - } - var estimatedSize = 0L if (usingMap) { estimatedSize = map.estimateSize() diff --git a/core/src/test/scala/org/apache/spark/deploy/SparkSubmitSuite.scala b/core/src/test/scala/org/apache/spark/deploy/SparkSubmitSuite.scala index 1110ca6051a40..1fd470cd3b01d 100644 --- a/core/src/test/scala/org/apache/spark/deploy/SparkSubmitSuite.scala +++ b/core/src/test/scala/org/apache/spark/deploy/SparkSubmitSuite.scala @@ -147,7 +147,7 @@ class SparkSubmitSuite "--archives", "archive1.txt,archive2.txt", "--num-executors", "6", "--name", "beauty", - "--conf", "spark.shuffle.spill=false", + "--conf", "spark.ui.enabled=false", "thejar.jar", "arg1", "arg2") val appArgs = new SparkSubmitArguments(clArgs) @@ -166,7 +166,7 @@ class SparkSubmitSuite 
mainClass should be ("org.apache.spark.deploy.yarn.Client") classpath should have length (0) sysProps("spark.app.name") should be ("beauty") - sysProps("spark.shuffle.spill") should be ("false") + sysProps("spark.ui.enabled") should be ("false") sysProps("SPARK_SUBMIT") should be ("true") sysProps.keys should not contain ("spark.jars") } @@ -185,7 +185,7 @@ class SparkSubmitSuite "--archives", "archive1.txt,archive2.txt", "--num-executors", "6", "--name", "trill", - "--conf", "spark.shuffle.spill=false", + "--conf", "spark.ui.enabled=false", "thejar.jar", "arg1", "arg2") val appArgs = new SparkSubmitArguments(clArgs) @@ -206,7 +206,7 @@ class SparkSubmitSuite sysProps("spark.yarn.dist.archives") should include regex (".*archive1.txt,.*archive2.txt") sysProps("spark.jars") should include regex (".*one.jar,.*two.jar,.*three.jar,.*thejar.jar") sysProps("SPARK_SUBMIT") should be ("true") - sysProps("spark.shuffle.spill") should be ("false") + sysProps("spark.ui.enabled") should be ("false") } test("handles standalone cluster mode") { @@ -229,7 +229,7 @@ class SparkSubmitSuite "--supervise", "--driver-memory", "4g", "--driver-cores", "5", - "--conf", "spark.shuffle.spill=false", + "--conf", "spark.ui.enabled=false", "thejar.jar", "arg1", "arg2") val appArgs = new SparkSubmitArguments(clArgs) @@ -253,9 +253,9 @@ class SparkSubmitSuite sysProps.keys should contain ("spark.driver.memory") sysProps.keys should contain ("spark.driver.cores") sysProps.keys should contain ("spark.driver.supervise") - sysProps.keys should contain ("spark.shuffle.spill") + sysProps.keys should contain ("spark.ui.enabled") sysProps.keys should contain ("spark.submit.deployMode") - sysProps("spark.shuffle.spill") should be ("false") + sysProps("spark.ui.enabled") should be ("false") } test("handles standalone client mode") { @@ -266,7 +266,7 @@ class SparkSubmitSuite "--total-executor-cores", "5", "--class", "org.SomeClass", "--driver-memory", "4g", - "--conf", "spark.shuffle.spill=false", + "--conf", "spark.ui.enabled=false", "thejar.jar", "arg1", "arg2") val appArgs = new SparkSubmitArguments(clArgs) @@ -277,7 +277,7 @@ class SparkSubmitSuite classpath(0) should endWith ("thejar.jar") sysProps("spark.executor.memory") should be ("5g") sysProps("spark.cores.max") should be ("5") - sysProps("spark.shuffle.spill") should be ("false") + sysProps("spark.ui.enabled") should be ("false") } test("handles mesos client mode") { @@ -288,7 +288,7 @@ class SparkSubmitSuite "--total-executor-cores", "5", "--class", "org.SomeClass", "--driver-memory", "4g", - "--conf", "spark.shuffle.spill=false", + "--conf", "spark.ui.enabled=false", "thejar.jar", "arg1", "arg2") val appArgs = new SparkSubmitArguments(clArgs) @@ -299,7 +299,7 @@ class SparkSubmitSuite classpath(0) should endWith ("thejar.jar") sysProps("spark.executor.memory") should be ("5g") sysProps("spark.cores.max") should be ("5") - sysProps("spark.shuffle.spill") should be ("false") + sysProps("spark.ui.enabled") should be ("false") } test("handles confs with flag equivalents") { diff --git a/docs/configuration.md b/docs/configuration.md index 3700051efb448..5ec097c78aa38 100644 --- a/docs/configuration.md +++ b/docs/configuration.md @@ -69,7 +69,7 @@ val sc = new SparkContext(new SparkConf()) Then, you can supply configuration values at runtime: {% highlight bash %} -./bin/spark-submit --name "My app" --master local[4] --conf spark.shuffle.spill=false +./bin/spark-submit --name "My app" --master local[4] --conf spark.eventLog.enabled=false --conf 
"spark.executor.extraJavaOptions=-XX:+PrintGCDetails -XX:+PrintGCTimeStamps" myApp.jar {% endhighlight %} @@ -449,8 +449,8 @@ Apart from these, the following properties are also available, and may be useful spark.shuffle.memoryFraction 0.2 - Fraction of Java heap to use for aggregation and cogroups during shuffles, if - spark.shuffle.spill is true. At any given time, the collective size of + Fraction of Java heap to use for aggregation and cogroups during shuffles. + At any given time, the collective size of all in-memory maps used for shuffles is bounded by this limit, beyond which the contents will begin to spill to disk. If spills are often, consider increasing this value at the expense of spark.storage.memoryFraction. @@ -483,14 +483,6 @@ Apart from these, the following properties are also available, and may be useful map-side aggregation and there are at most this many reduce partitions. - - spark.shuffle.spill - true - - If set to "true", limits the amount of memory used during reduces by spilling data out to disk. - This spilling threshold is specified by spark.shuffle.memoryFraction. - - spark.shuffle.spill.compress true diff --git a/docs/sql-programming-guide.md b/docs/sql-programming-guide.md index 82d4243cc6b27..7ae9244c271e3 100644 --- a/docs/sql-programming-guide.md +++ b/docs/sql-programming-guide.md @@ -1936,13 +1936,6 @@ that these options will be deprecated in future release as more optimizations ar Configures the number of partitions to use when shuffling data for joins or aggregations. - - spark.sql.planner.externalSort - true - - When true, performs sorts spilling to disk as needed otherwise sort each partition in memory. - - # Distributed SQL Engine diff --git a/python/pyspark/rdd.py b/python/pyspark/rdd.py index ab5aab1e115f7..73d7d9a5692a9 100644 --- a/python/pyspark/rdd.py +++ b/python/pyspark/rdd.py @@ -48,7 +48,7 @@ from pyspark.rddsampler import RDDSampler, RDDRangeSampler, RDDStratifiedSampler from pyspark.storagelevel import StorageLevel from pyspark.resultiterable import ResultIterable -from pyspark.shuffle import Aggregator, InMemoryMerger, ExternalMerger, \ +from pyspark.shuffle import Aggregator, ExternalMerger, \ get_used_memory, ExternalSorter, ExternalGroupBy from pyspark.traceback_utils import SCCallSiteSync @@ -580,12 +580,11 @@ def repartitionAndSortWithinPartitions(self, numPartitions=None, partitionFunc=p if numPartitions is None: numPartitions = self._defaultReducePartitions() - spill = (self.ctx._conf.get("spark.shuffle.spill", 'True').lower() == "true") memory = _parse_memory(self.ctx._conf.get("spark.python.worker.memory", "512m")) serializer = self._jrdd_deserializer def sortPartition(iterator): - sort = ExternalSorter(memory * 0.9, serializer).sorted if spill else sorted + sort = ExternalSorter(memory * 0.9, serializer).sorted return iter(sort(iterator, key=lambda k_v: keyfunc(k_v[0]), reverse=(not ascending))) return self.partitionBy(numPartitions, partitionFunc).mapPartitions(sortPartition, True) @@ -610,12 +609,11 @@ def sortByKey(self, ascending=True, numPartitions=None, keyfunc=lambda x: x): if numPartitions is None: numPartitions = self._defaultReducePartitions() - spill = self._can_spill() memory = self._memory_limit() serializer = self._jrdd_deserializer def sortPartition(iterator): - sort = ExternalSorter(memory * 0.9, serializer).sorted if spill else sorted + sort = ExternalSorter(memory * 0.9, serializer).sorted return iter(sort(iterator, key=lambda kv: keyfunc(kv[0]), reverse=(not ascending))) if numPartitions == 1: @@ -1770,13 
+1768,11 @@ def combineByKey(self, createCombiner, mergeValue, mergeCombiners, numPartitions = self._defaultReducePartitions() serializer = self.ctx.serializer - spill = self._can_spill() memory = self._memory_limit() agg = Aggregator(createCombiner, mergeValue, mergeCombiners) def combineLocally(iterator): - merger = ExternalMerger(agg, memory * 0.9, serializer) \ - if spill else InMemoryMerger(agg) + merger = ExternalMerger(agg, memory * 0.9, serializer) merger.mergeValues(iterator) return merger.items() @@ -1784,8 +1780,7 @@ def combineLocally(iterator): shuffled = locally_combined.partitionBy(numPartitions) def _mergeCombiners(iterator): - merger = ExternalMerger(agg, memory, serializer) \ - if spill else InMemoryMerger(agg) + merger = ExternalMerger(agg, memory, serializer) merger.mergeCombiners(iterator) return merger.items() @@ -1824,9 +1819,6 @@ def createZero(): return self.combineByKey(lambda v: func(createZero(), v), func, func, numPartitions) - def _can_spill(self): - return self.ctx._conf.get("spark.shuffle.spill", "True").lower() == "true" - def _memory_limit(self): return _parse_memory(self.ctx._conf.get("spark.python.worker.memory", "512m")) @@ -1857,14 +1849,12 @@ def mergeCombiners(a, b): a.extend(b) return a - spill = self._can_spill() memory = self._memory_limit() serializer = self._jrdd_deserializer agg = Aggregator(createCombiner, mergeValue, mergeCombiners) def combine(iterator): - merger = ExternalMerger(agg, memory * 0.9, serializer) \ - if spill else InMemoryMerger(agg) + merger = ExternalMerger(agg, memory * 0.9, serializer) merger.mergeValues(iterator) return merger.items() @@ -1872,8 +1862,7 @@ def combine(iterator): shuffled = locally_combined.partitionBy(numPartitions) def groupByKey(it): - merger = ExternalGroupBy(agg, memory, serializer)\ - if spill else InMemoryMerger(agg) + merger = ExternalGroupBy(agg, memory, serializer) merger.mergeCombiners(it) return merger.items() diff --git a/python/pyspark/shuffle.py b/python/pyspark/shuffle.py index b8118bdb7ca76..e974cda9fc3e1 100644 --- a/python/pyspark/shuffle.py +++ b/python/pyspark/shuffle.py @@ -131,36 +131,6 @@ def items(self): raise NotImplementedError -class InMemoryMerger(Merger): - - """ - In memory merger based on in-memory dict. 
- """ - - def __init__(self, aggregator): - Merger.__init__(self, aggregator) - self.data = {} - - def mergeValues(self, iterator): - """ Combine the items by creator and combiner """ - # speed up attributes lookup - d, creator = self.data, self.agg.createCombiner - comb = self.agg.mergeValue - for k, v in iterator: - d[k] = comb(d[k], v) if k in d else creator(v) - - def mergeCombiners(self, iterator): - """ Merge the combined items by mergeCombiner """ - # speed up attributes lookup - d, comb = self.data, self.agg.mergeCombiners - for k, v in iterator: - d[k] = comb(d[k], v) if k in d else v - - def items(self): - """ Return the merged items ad iterator """ - return iter(self.data.items()) - - def _compressed_serializer(self, serializer=None): # always use PickleSerializer to simplify implementation ser = PickleSerializer() diff --git a/python/pyspark/tests.py b/python/pyspark/tests.py index 647504c32f156..f11aaf001c8df 100644 --- a/python/pyspark/tests.py +++ b/python/pyspark/tests.py @@ -62,7 +62,7 @@ CloudPickleSerializer, CompressedSerializer, UTF8Deserializer, NoOpSerializer, \ PairDeserializer, CartesianDeserializer, AutoBatchedSerializer, AutoSerializer, \ FlattenedValuesSerializer -from pyspark.shuffle import Aggregator, InMemoryMerger, ExternalMerger, ExternalSorter +from pyspark.shuffle import Aggregator, ExternalMerger, ExternalSorter from pyspark import shuffle from pyspark.profiler import BasicProfiler @@ -95,17 +95,6 @@ def setUp(self): lambda x, y: x.append(y) or x, lambda x, y: x.extend(y) or x) - def test_in_memory(self): - m = InMemoryMerger(self.agg) - m.mergeValues(self.data) - self.assertEqual(sum(sum(v) for k, v in m.items()), - sum(xrange(self.N))) - - m = InMemoryMerger(self.agg) - m.mergeCombiners(map(lambda x_y: (x_y[0], [x_y[1]]), self.data)) - self.assertEqual(sum(sum(v) for k, v in m.items()), - sum(xrange(self.N))) - def test_small_dataset(self): m = ExternalMerger(self.agg, 1000) m.mergeValues(self.data) diff --git a/sql/core/src/main/scala/org/apache/spark/sql/SQLConf.scala b/sql/core/src/main/scala/org/apache/spark/sql/SQLConf.scala index 9de75f4c4d084..b9fb90d964206 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/SQLConf.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/SQLConf.scala @@ -330,11 +330,6 @@ private[spark] object SQLConf { // Options that control which operators can be chosen by the query planner. These should be // considered hints and may be ignored by future versions of Spark SQL. 
- val EXTERNAL_SORT = booleanConf("spark.sql.planner.externalSort", - defaultValue = Some(true), - doc = "When true, performs sorts spilling to disk as needed otherwise sort each partition in" + - " memory.") - val SORTMERGE_JOIN = booleanConf("spark.sql.planner.sortMergeJoin", defaultValue = Some(true), doc = "When true, use sort merge join (as opposed to hash join) by default for large joins.") @@ -422,6 +417,7 @@ private[spark] object SQLConf { object Deprecated { val MAPRED_REDUCE_TASKS = "mapred.reduce.tasks" + val EXTERNAL_SORT = "spark.sql.planner.externalSort" } } @@ -476,8 +472,6 @@ private[sql] class SQLConf extends Serializable with CatalystConf { private[spark] def metastorePartitionPruning: Boolean = getConf(HIVE_METASTORE_PARTITION_PRUNING) - private[spark] def externalSortEnabled: Boolean = getConf(EXTERNAL_SORT) - private[spark] def sortMergeJoinEnabled: Boolean = getConf(SORTMERGE_JOIN) private[spark] def codegenEnabled: Boolean = getConf(CODEGEN_ENABLED, getConf(TUNGSTEN_ENABLED)) diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/SparkStrategies.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/SparkStrategies.scala index 5e40d77689045..41b215c79296a 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/SparkStrategies.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/SparkStrategies.scala @@ -312,8 +312,6 @@ private[sql] abstract class SparkStrategies extends QueryPlanner[SparkPlan] { if (sqlContext.conf.unsafeEnabled && sqlContext.conf.codegenEnabled && TungstenSort.supportsSchema(child.schema)) { execution.TungstenSort(sortExprs, global, child) - } else if (sqlContext.conf.externalSortEnabled) { - execution.ExternalSort(sortExprs, global, child) } else { execution.Sort(sortExprs, global, child) } diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/commands.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/commands.scala index 95209e6634519..af28e2dfa4186 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/commands.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/commands.scala @@ -105,6 +105,15 @@ case class SetCommand(kv: Option[(String, Option[String])]) extends RunnableComm } (keyValueOutput, runFunc) + case Some((SQLConf.Deprecated.EXTERNAL_SORT, Some(value))) => + val runFunc = (sqlContext: SQLContext) => { + logWarning( + s"Property ${SQLConf.Deprecated.EXTERNAL_SORT} is deprecated and will be ignored. " + + s"External sort will continue to be used.") + Seq(Row(SQLConf.Deprecated.EXTERNAL_SORT, "true")) + } + (keyValueOutput, runFunc) + // Configures a single property. case Some((key, Some(value))) => val runFunc = (sqlContext: SQLContext) => { diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/sort.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/sort.scala index 40ef7c3b53530..27f26245a5ef0 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/sort.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/sort.scala @@ -31,38 +31,12 @@ import org.apache.spark.{SparkEnv, InternalAccumulator, TaskContext} // This file defines various sort operators. //////////////////////////////////////////////////////////////////////////////////////////////////// - -/** - * Performs a sort on-heap. - * @param global when true performs a global sort of all partitions by shuffling the data first - * if necessary. 
- */ -case class Sort( - sortOrder: Seq[SortOrder], - global: Boolean, - child: SparkPlan) - extends UnaryNode { - override def requiredChildDistribution: Seq[Distribution] = - if (global) OrderedDistribution(sortOrder) :: Nil else UnspecifiedDistribution :: Nil - - protected override def doExecute(): RDD[InternalRow] = attachTree(this, "sort") { - child.execute().mapPartitions( { iterator => - val ordering = newOrdering(sortOrder, child.output) - iterator.map(_.copy()).toArray.sorted(ordering).iterator - }, preservesPartitioning = true) - } - - override def output: Seq[Attribute] = child.output - - override def outputOrdering: Seq[SortOrder] = sortOrder -} - /** * Performs a sort, spilling to disk as needed. * @param global when true performs a global sort of all partitions by shuffling the data first * if necessary. */ -case class ExternalSort( +case class Sort( sortOrder: Seq[SortOrder], global: Boolean, child: SparkPlan) @@ -93,7 +67,7 @@ case class ExternalSort( } /** - * Optimized version of [[ExternalSort]] that operates on binary data (implemented as part of + * Optimized version of [[Sort]] that operates on binary data (implemented as part of * Project Tungsten). * * @param global when true performs a global sort of all partitions by shuffling the data first diff --git a/sql/core/src/test/scala/org/apache/spark/sql/SQLQuerySuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/SQLQuerySuite.scala index f9981356f364f..05b4127cbcaff 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/SQLQuerySuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/SQLQuerySuite.scala @@ -581,28 +581,12 @@ class SQLQuerySuite extends QueryTest with SharedSQLContext { mapData.collect().sortBy(_.data(1)).reverse.map(Row.fromTuple).toSeq) } - test("sorting") { - withSQLConf(SQLConf.EXTERNAL_SORT.key -> "false") { - sortTest() - } - } - test("external sorting") { - withSQLConf(SQLConf.EXTERNAL_SORT.key -> "true") { - sortTest() - } - } - - test("SPARK-6927 sorting with codegen on") { - withSQLConf(SQLConf.EXTERNAL_SORT.key -> "false", - SQLConf.CODEGEN_ENABLED.key -> "true") { - sortTest() - } + sortTest() } test("SPARK-6927 external sorting with codegen on") { - withSQLConf(SQLConf.EXTERNAL_SORT.key -> "true", - SQLConf.CODEGEN_ENABLED.key -> "true") { + withSQLConf(SQLConf.CODEGEN_ENABLED.key -> "true") { sortTest() } } @@ -1731,10 +1715,8 @@ class SQLQuerySuite extends QueryTest with SharedSQLContext { } test("external sorting updates peak execution memory") { - withSQLConf((SQLConf.EXTERNAL_SORT.key, "true")) { - AccumulatorSuite.verifyPeakExecutionMemorySet(sparkContext, "external sort") { - sortTest() - } + AccumulatorSuite.verifyPeakExecutionMemorySet(sparkContext, "external sort") { + sortTest() } } diff --git a/sql/core/src/test/scala/org/apache/spark/sql/execution/RowFormatConvertersSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/execution/RowFormatConvertersSuite.scala index 4492e37ad01ff..5dc37e5c3c238 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/execution/RowFormatConvertersSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/execution/RowFormatConvertersSuite.scala @@ -32,7 +32,7 @@ class RowFormatConvertersSuite extends SparkPlanTest with SharedSQLContext { case c: ConvertToSafe => c } - private val outputsSafe = ExternalSort(Nil, false, PhysicalRDD(Seq.empty, null, "name")) + private val outputsSafe = Sort(Nil, false, PhysicalRDD(Seq.empty, null, "name")) assert(!outputsSafe.outputsUnsafeRows) private val outputsUnsafe = TungstenSort(Nil, 
false, PhysicalRDD(Seq.empty, null, "name")) assert(outputsUnsafe.outputsUnsafeRows) diff --git a/sql/core/src/test/scala/org/apache/spark/sql/execution/SortSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/execution/SortSuite.scala index 3073d492e613b..847c188a30333 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/execution/SortSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/execution/SortSuite.scala @@ -36,13 +36,13 @@ class SortSuite extends SparkPlanTest with SharedSQLContext { checkAnswer( input.toDF("a", "b", "c"), - ExternalSort('a.asc :: 'b.asc :: Nil, global = true, _: SparkPlan), + Sort('a.asc :: 'b.asc :: Nil, global = true, _: SparkPlan), input.sortBy(t => (t._1, t._2)).map(Row.fromTuple), sortAnswers = false) checkAnswer( input.toDF("a", "b", "c"), - ExternalSort('b.asc :: 'a.asc :: Nil, global = true, _: SparkPlan), + Sort('b.asc :: 'a.asc :: Nil, global = true, _: SparkPlan), input.sortBy(t => (t._2, t._1)).map(Row.fromTuple), sortAnswers = false) } From 1aa9e50256988533fa54584b49dbc408a14438ee Mon Sep 17 00:00:00 2001 From: Sean Owen Date: Sun, 20 Sep 2015 16:05:12 -0700 Subject: [PATCH 355/802] [SPARK-5905] [MLLIB] Note requirements for certain RowMatrix methods in docs Note methods that fail for cols > 65535; note that SVD does not require n >= m CC mengxr Author: Sean Owen Closes #8839 from srowen/SPARK-5905. --- .../spark/mllib/linalg/distributed/RowMatrix.scala | 11 ++++++++--- 1 file changed, 8 insertions(+), 3 deletions(-) diff --git a/mllib/src/main/scala/org/apache/spark/mllib/linalg/distributed/RowMatrix.scala b/mllib/src/main/scala/org/apache/spark/mllib/linalg/distributed/RowMatrix.scala index e55ef26858adb..7c7d900af3d5a 100644 --- a/mllib/src/main/scala/org/apache/spark/mllib/linalg/distributed/RowMatrix.scala +++ b/mllib/src/main/scala/org/apache/spark/mllib/linalg/distributed/RowMatrix.scala @@ -109,7 +109,8 @@ class RowMatrix @Since("1.0.0") ( } /** - * Computes the Gramian matrix `A^T A`. + * Computes the Gramian matrix `A^T A`. Note that this cannot be computed on matrices with + * more than 65535 columns. */ @Since("1.0.0") def computeGramianMatrix(): Matrix = { @@ -150,7 +151,8 @@ class RowMatrix @Since("1.0.0") ( * - s is a Vector of size k, holding the singular values in descending order, * - V is a Matrix of size n x k that satisfies V' * V = eye(k). * - * We assume n is smaller than m. The singular values and the right singular vectors are derived + * We assume n is smaller than m, though this is not strictly required. + * The singular values and the right singular vectors are derived * from the eigenvalues and the eigenvectors of the Gramian matrix A' * A. U, the matrix * storing the right singular vectors, is computed via matrix multiplication as * U = A * (V * S^-1^), if requested by user. The actual method to use is determined @@ -320,7 +322,8 @@ class RowMatrix @Since("1.0.0") ( } /** - * Computes the covariance matrix, treating each row as an observation. + * Computes the covariance matrix, treating each row as an observation. Note that this cannot + * be computed on matrices with more than 65535 columns. * @return a local dense matrix of size n x n */ @Since("1.0.0") @@ -374,6 +377,8 @@ class RowMatrix @Since("1.0.0") ( * The row data do not need to be "centered" first; it is not necessary for * the mean of each column to be 0. * + * Note that this cannot be computed on matrices with more than 65535 columns. + * * @param k number of top principal components. 
* @return a matrix of size n-by-k, whose columns are principal components */ From 0c498717ba9622b6c889e701e8eed5ef9215c030 Mon Sep 17 00:00:00 2001 From: lewuathe Date: Sun, 20 Sep 2015 16:16:31 -0700 Subject: [PATCH 356/802] [SPARK-10715] [ML] Duplicate initialization flag in WeightedLeastSquare There are duplicate set of initialization flag in `WeightedLeastSquares#add`. `initialized` is already set in `init(Int)`. Author: lewuathe Closes #8837 from Lewuathe/duplicate-initialization-flag. --- .../scala/org/apache/spark/ml/optim/WeightedLeastSquares.scala | 1 - 1 file changed, 1 deletion(-) diff --git a/mllib/src/main/scala/org/apache/spark/ml/optim/WeightedLeastSquares.scala b/mllib/src/main/scala/org/apache/spark/ml/optim/WeightedLeastSquares.scala index 0ff8931b0bab4..4374e99631560 100644 --- a/mllib/src/main/scala/org/apache/spark/ml/optim/WeightedLeastSquares.scala +++ b/mllib/src/main/scala/org/apache/spark/ml/optim/WeightedLeastSquares.scala @@ -193,7 +193,6 @@ private[ml] object WeightedLeastSquares { val ak = a.size if (!initialized) { init(ak) - initialized = true } assert(ak == k, s"Dimension mismatch. Expect vectors of size $k but got $ak.") count += 1L From 01440395176bdbb2662480f03b27851cb860f385 Mon Sep 17 00:00:00 2001 From: vinodkc Date: Sun, 20 Sep 2015 22:55:24 -0700 Subject: [PATCH 357/802] [SPARK-10631] [DOCUMENTATION, MLLIB, PYSPARK] Added documentation for few APIs There are some missing API docs in pyspark.mllib.linalg.Vector (including DenseVector and SparseVector). We should add them based on their Scala counterparts. Author: vinodkc Closes #8834 from vinodkc/fix_SPARK-10631. --- python/pyspark/mllib/linalg/__init__.py | 22 +++++++++++++++++----- 1 file changed, 17 insertions(+), 5 deletions(-) diff --git a/python/pyspark/mllib/linalg/__init__.py b/python/pyspark/mllib/linalg/__init__.py index 4829acb16ed8a..f929e3e96fbe2 100644 --- a/python/pyspark/mllib/linalg/__init__.py +++ b/python/pyspark/mllib/linalg/__init__.py @@ -301,11 +301,14 @@ def __reduce__(self): return DenseVector, (self.array.tostring(),) def numNonzeros(self): + """ + Number of nonzero elements. This scans all active values and count non zeros + """ return np.count_nonzero(self.array) def norm(self, p): """ - Calculte the norm of a DenseVector. + Calculates the norm of a DenseVector. >>> a = DenseVector([0, -1, 2, -3]) >>> a.norm(2) @@ -397,10 +400,16 @@ def squared_distance(self, other): return np.dot(diff, diff) def toArray(self): + """ + Returns an numpy.ndarray + """ return self.array @property def values(self): + """ + Returns a list of values + """ return self.array def __getitem__(self, item): @@ -479,8 +488,8 @@ def __init__(self, size, *args): :param size: Size of the vector. :param args: Active entries, as a dictionary {index: value, ...}, - a list of tuples [(index, value), ...], or a list of strictly i - ncreasing indices and a list of corresponding values [index, ...], + a list of tuples [(index, value), ...], or a list of strictly + increasing indices and a list of corresponding values [index, ...], [value, ...]. Inactive entries are treated as zeros. >>> SparseVector(4, {1: 1.0, 3: 5.5}) @@ -521,11 +530,14 @@ def __init__(self, size, *args): raise TypeError("indices array must be sorted") def numNonzeros(self): + """ + Number of nonzero elements. This scans all active values and count non zeros. + """ return np.count_nonzero(self.values) def norm(self, p): """ - Calculte the norm of a SparseVector. + Calculates the norm of a SparseVector. 
>>> a = SparseVector(4, [0, 1], [3., -4.]) >>> a.norm(1) @@ -797,7 +809,7 @@ def sparse(size, *args): values (sorted by index). :param size: Size of the vector. - :param args: Non-zero entries, as a dictionary, list of tupes, + :param args: Non-zero entries, as a dictionary, list of tuples, or two sorted lists containing indices and values. >>> Vectors.sparse(4, {1: 1.0, 3: 5.5}) From 20a61dbd9b57957fcc5b58ef8935533914172b07 Mon Sep 17 00:00:00 2001 From: Holden Karau Date: Mon, 21 Sep 2015 18:53:28 +0100 Subject: [PATCH 358/802] [SPARK-10626] [MLLIB] create java friendly method for random rdd SPARK-3136 added a large number of functions for creating Java RandomRDDs, but for people that want to use custom RandomDataGenerators we should make a Java friendly method. Author: Holden Karau Closes #8782 from holdenk/SPARK-10626-create-java-friendly-method-for-randomRDD. --- .../spark/mllib/random/RandomRDDs.scala | 52 ++++++++++++++++++- .../mllib/random/JavaRandomRDDsSuite.java | 30 +++++++++++ 2 files changed, 81 insertions(+), 1 deletion(-) diff --git a/mllib/src/main/scala/org/apache/spark/mllib/random/RandomRDDs.scala b/mllib/src/main/scala/org/apache/spark/mllib/random/RandomRDDs.scala index 4dd5ea214d678..f8ff26b5795be 100644 --- a/mllib/src/main/scala/org/apache/spark/mllib/random/RandomRDDs.scala +++ b/mllib/src/main/scala/org/apache/spark/mllib/random/RandomRDDs.scala @@ -22,6 +22,7 @@ import scala.reflect.ClassTag import org.apache.spark.SparkContext import org.apache.spark.annotation.{DeveloperApi, Experimental, Since} import org.apache.spark.api.java.{JavaDoubleRDD, JavaRDD, JavaSparkContext} +import org.apache.spark.api.java.JavaSparkContext.fakeClassTag import org.apache.spark.mllib.linalg.Vector import org.apache.spark.mllib.rdd.{RandomRDD, RandomVectorRDD} import org.apache.spark.rdd.RDD @@ -381,7 +382,7 @@ object RandomRDDs { * @param size Size of the RDD. * @param numPartitions Number of partitions in the RDD (default: `sc.defaultParallelism`). * @param seed Random seed (default: a random long integer). - * @return RDD[Double] comprised of `i.i.d.` samples produced by generator. + * @return RDD[T] comprised of `i.i.d.` samples produced by generator. */ @DeveloperApi @Since("1.1.0") @@ -394,6 +395,55 @@ object RandomRDDs { new RandomRDD[T](sc, size, numPartitionsOrDefault(sc, numPartitions), generator, seed) } + /** + * :: DeveloperApi :: + * Generates an RDD comprised of `i.i.d.` samples produced by the input RandomDataGenerator. + * + * @param jsc JavaSparkContext used to create the RDD. + * @param generator RandomDataGenerator used to populate the RDD. + * @param size Size of the RDD. + * @param numPartitions Number of partitions in the RDD (default: `sc.defaultParallelism`). + * @param seed Random seed (default: a random long integer). + * @return RDD[T] comprised of `i.i.d.` samples produced by generator. + */ + @DeveloperApi + @Since("1.6.0") + def randomJavaRDD[T]( + jsc: JavaSparkContext, + generator: RandomDataGenerator[T], + size: Long, + numPartitions: Int, + seed: Long): JavaRDD[T] = { + implicit val ctag: ClassTag[T] = fakeClassTag + val rdd = randomRDD(jsc.sc, generator, size, numPartitions, seed) + JavaRDD.fromRDD(rdd) + } + + /** + * [[RandomRDDs#randomJavaRDD]] with the default seed. 
+ */ + @DeveloperApi + @Since("1.6.0") + def randomJavaRDD[T]( + jsc: JavaSparkContext, + generator: RandomDataGenerator[T], + size: Long, + numPartitions: Int): JavaRDD[T] = { + randomJavaRDD(jsc, generator, size, numPartitions, Utils.random.nextLong()) + } + + /** + * [[RandomRDDs#randomJavaRDD]] with the default seed & numPartitions + */ + @DeveloperApi + @Since("1.6.0") + def randomJavaRDD[T]( + jsc: JavaSparkContext, + generator: RandomDataGenerator[T], + size: Long): JavaRDD[T] = { + randomJavaRDD(jsc, generator, size, 0); + } + // TODO Generate RDD[Vector] from multivariate distributions. /** diff --git a/mllib/src/test/java/org/apache/spark/mllib/random/JavaRandomRDDsSuite.java b/mllib/src/test/java/org/apache/spark/mllib/random/JavaRandomRDDsSuite.java index 33d81b1e9592b..fce5f6712f462 100644 --- a/mllib/src/test/java/org/apache/spark/mllib/random/JavaRandomRDDsSuite.java +++ b/mllib/src/test/java/org/apache/spark/mllib/random/JavaRandomRDDsSuite.java @@ -17,6 +17,7 @@ package org.apache.spark.mllib.random; +import java.io.Serializable; import java.util.Arrays; import org.apache.spark.api.java.JavaRDD; @@ -231,4 +232,33 @@ public void testGammaVectorRDD() { } } + @Test + public void testArbitrary() { + long size = 10; + long seed = 1L; + int numPartitions = 0; + StringGenerator gen = new StringGenerator(); + JavaRDD rdd1 = randomJavaRDD(sc, gen, size); + JavaRDD rdd2 = randomJavaRDD(sc, gen, size, numPartitions); + JavaRDD rdd3 = randomJavaRDD(sc, gen, size, numPartitions, seed); + for (JavaRDD rdd: Arrays.asList(rdd1, rdd2, rdd3)) { + Assert.assertEquals(size, rdd.count()); + Assert.assertEquals(2, rdd.first().length()); + } + } +} + +// This is just a test generator, it always returns a string of 42 +class StringGenerator implements RandomDataGenerator, Serializable { + @Override + public String nextValue() { + return "42"; + } + @Override + public StringGenerator copy() { + return new StringGenerator(); + } + @Override + public void setSeed(long seed) { + } } From ebbf85f07bb8de0d566f1ae4b41f26421180bebe Mon Sep 17 00:00:00 2001 From: zsxwing Date: Mon, 21 Sep 2015 11:39:04 -0700 Subject: [PATCH 359/802] [SPARK-7989] [SPARK-10651] [CORE] [TESTS] Increase timeout to fix flaky tests I noticed only one block manager registered with master in an unsuccessful build (https://amplab.cs.berkeley.edu/jenkins/job/Spark-Master-SBT/AMPLAB_JENKINS_BUILD_PROFILE=hadoop2.2,label=spark-test/3534/) ``` 15/09/16 13:02:30.981 pool-1-thread-1-ScalaTest-running-BroadcastSuite INFO SparkContext: Running Spark version 1.6.0-SNAPSHOT ... 15/09/16 13:02:38.133 sparkDriver-akka.actor.default-dispatcher-19 INFO BlockManagerMasterEndpoint: Registering block manager localhost:48196 with 530.3 MB RAM, BlockManagerId(0, localhost, 48196) ``` In addition, the first block manager needed 7+ seconds to start. But the test expected 2 block managers so it failed. However, there was no exception in this log file. So I checked a successful build (https://amplab.cs.berkeley.edu/jenkins/job/Spark-Master-SBT/3536/AMPLAB_JENKINS_BUILD_PROFILE=hadoop2.2,label=spark-test/) and it needed 4-5 seconds to set up the local cluster: ``` 15/09/16 18:11:27.738 sparkWorker1-akka.actor.default-dispatcher-5 INFO Worker: Running Spark version 1.6.0-SNAPSHOT ... 
15/09/16 18:11:30.838 sparkDriver-akka.actor.default-dispatcher-20 INFO BlockManagerMasterEndpoint: Registering block manager localhost:54202 with 530.3 MB RAM, BlockManagerId(1, localhost, 54202) 15/09/16 18:11:32.112 sparkDriver-akka.actor.default-dispatcher-20 INFO BlockManagerMasterEndpoint: Registering block manager localhost:32955 with 530.3 MB RAM, BlockManagerId(0, localhost, 32955) ``` In this build, the first block manager needed only 3+ seconds to start. Comparing these two builds, I guess it's possible that the local cluster in `BroadcastSuite` cannot be ready in 10 seconds if the Jenkins worker is busy. So I just increased the timeout to 60 seconds to see if this can fix the issue. Author: zsxwing Closes #8813 from zsxwing/fix-BroadcastSuite. --- .../scala/org/apache/spark/ExternalShuffleServiceSuite.scala | 2 +- .../test/scala/org/apache/spark/broadcast/BroadcastSuite.scala | 2 +- .../apache/spark/scheduler/SparkListenerWithClusterSuite.scala | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/core/src/test/scala/org/apache/spark/ExternalShuffleServiceSuite.scala b/core/src/test/scala/org/apache/spark/ExternalShuffleServiceSuite.scala index e846a72c888c6..231f4631e0a47 100644 --- a/core/src/test/scala/org/apache/spark/ExternalShuffleServiceSuite.scala +++ b/core/src/test/scala/org/apache/spark/ExternalShuffleServiceSuite.scala @@ -61,7 +61,7 @@ class ExternalShuffleServiceSuite extends ShuffleSuite with BeforeAndAfterAll { // local blocks from the local BlockManager and won't send requests to ExternalShuffleService. // In this case, we won't receive FetchFailed. And it will make this test fail. // Therefore, we should wait until all slaves are up - sc.jobProgressListener.waitUntilExecutorsUp(2, 10000) + sc.jobProgressListener.waitUntilExecutorsUp(2, 60000) val rdd = sc.parallelize(0 until 1000, 10).map(i => (i, 1)).reduceByKey(_ + _) diff --git a/core/src/test/scala/org/apache/spark/broadcast/BroadcastSuite.scala b/core/src/test/scala/org/apache/spark/broadcast/BroadcastSuite.scala index fb7a8ae3f9d41..ba21075ce6be5 100644 --- a/core/src/test/scala/org/apache/spark/broadcast/BroadcastSuite.scala +++ b/core/src/test/scala/org/apache/spark/broadcast/BroadcastSuite.scala @@ -311,7 +311,7 @@ class BroadcastSuite extends SparkFunSuite with LocalSparkContext { new SparkContext("local-cluster[%d, 1, 1024]".format(numSlaves), "test", broadcastConf) // Wait until all salves are up try { - _sc.jobProgressListener.waitUntilExecutorsUp(numSlaves, 10000) + _sc.jobProgressListener.waitUntilExecutorsUp(numSlaves, 60000) _sc } catch { case e: Throwable => diff --git a/core/src/test/scala/org/apache/spark/scheduler/SparkListenerWithClusterSuite.scala b/core/src/test/scala/org/apache/spark/scheduler/SparkListenerWithClusterSuite.scala index d1e23ed527ff1..9fa8859382911 100644 --- a/core/src/test/scala/org/apache/spark/scheduler/SparkListenerWithClusterSuite.scala +++ b/core/src/test/scala/org/apache/spark/scheduler/SparkListenerWithClusterSuite.scala @@ -43,7 +43,7 @@ class SparkListenerWithClusterSuite extends SparkFunSuite with LocalSparkContext // This test will check if the number of executors received by "SparkListener" is same as the // number of all executors, so we need to wait until all executors are up - sc.jobProgressListener.waitUntilExecutorsUp(2, 10000) + sc.jobProgressListener.waitUntilExecutorsUp(2, 60000) val rdd1 = sc.parallelize(1 to 100, 4) val rdd2 = rdd1.map(_.toString) From ca9fe540fe04e2e230d1e76526b5502bab152914 Mon Sep 17 00:00:00 2001 From: Jacek 
Laskowski Date: Mon, 21 Sep 2015 19:46:39 +0100 Subject: [PATCH 360/802] [SPARK-10662] [DOCS] Code snippets are not properly formatted in tables * Backticks are processed properly in Spark Properties table * Removed unnecessary spaces * See http://people.apache.org/~pwendell/spark-nightly/spark-master-docs/latest/running-on-yarn.html Author: Jacek Laskowski Closes #8795 from jaceklaskowski/docs-yarn-formatting. --- docs/configuration.md | 97 +++++++++++++++-------------- docs/programming-guide.md | 100 +++++++++++++++--------------- docs/running-on-mesos.md | 14 ++--- docs/running-on-yarn.md | 106 ++++++++++++++++---------------- docs/sql-programming-guide.md | 16 ++--- docs/submitting-applications.md | 8 +-- 6 files changed, 171 insertions(+), 170 deletions(-) diff --git a/docs/configuration.md b/docs/configuration.md index 5ec097c78aa38..b22587c70316b 100644 --- a/docs/configuration.md +++ b/docs/configuration.md @@ -34,20 +34,20 @@ val conf = new SparkConf() val sc = new SparkContext(conf) {% endhighlight %} -Note that we can have more than 1 thread in local mode, and in cases like Spark Streaming, we may +Note that we can have more than 1 thread in local mode, and in cases like Spark Streaming, we may actually require one to prevent any sort of starvation issues. -Properties that specify some time duration should be configured with a unit of time. +Properties that specify some time duration should be configured with a unit of time. The following format is accepted: - + 25ms (milliseconds) 5s (seconds) 10m or 10min (minutes) 3h (hours) 5d (days) 1y (years) - - + + Properties that specify a byte size should be configured with a unit of size. The following format is accepted: @@ -140,7 +140,7 @@ of the most common options to set are: Amount of memory to use for the driver process, i.e. where SparkContext is initialized. (e.g. 1g, 2g). - +
    Note: In client mode, this config must not be set through the SparkConf directly in your application, because the driver JVM has already started at that point. Instead, please set this through the --driver-memory command line option @@ -207,7 +207,7 @@ Apart from these, the following properties are also available, and may be useful
    Note: In client mode, this config must not be set through the SparkConf directly in your application, because the driver JVM has already started at that point. - Instead, please set this through the --driver-class-path command line option or in + Instead, please set this through the --driver-class-path command line option or in your default properties file. @@ -216,10 +216,10 @@ Apart from these, the following properties are also available, and may be useful (none) A string of extra JVM options to pass to the driver. For instance, GC settings or other logging. - +
    Note: In client mode, this config must not be set through the SparkConf directly in your application, because the driver JVM has already started at that point. - Instead, please set this through the --driver-java-options command line option or in + Instead, please set this through the --driver-java-options command line option or in your default properties file. @@ -228,10 +228,10 @@ Apart from these, the following properties are also available, and may be useful (none) Set a special library path to use when launching the driver JVM. - +
    Note: In client mode, this config must not be set through the SparkConf directly in your application, because the driver JVM has already started at that point. - Instead, please set this through the --driver-library-path command line option or in + Instead, please set this through the --driver-library-path command line option or in your default properties file. @@ -242,7 +242,7 @@ Apart from these, the following properties are also available, and may be useful (Experimental) Whether to give user-added jars precedence over Spark's own jars when loading classes in the the driver. This feature can be used to mitigate conflicts between Spark's dependencies and user dependencies. It is currently an experimental feature. - + This is used in cluster mode only. @@ -250,8 +250,8 @@ Apart from these, the following properties are also available, and may be useful spark.executor.extraClassPath (none) - Extra classpath entries to prepend to the classpath of executors. This exists primarily for - backwards-compatibility with older versions of Spark. Users typically should not need to set + Extra classpath entries to prepend to the classpath of executors. This exists primarily for + backwards-compatibility with older versions of Spark. Users typically should not need to set this option. @@ -259,9 +259,9 @@ Apart from these, the following properties are also available, and may be useful spark.executor.extraJavaOptions (none) - A string of extra JVM options to pass to executors. For instance, GC settings or other logging. - Note that it is illegal to set Spark properties or heap size settings with this option. Spark - properties should be set using a SparkConf object or the spark-defaults.conf file used with the + A string of extra JVM options to pass to executors. For instance, GC settings or other logging. + Note that it is illegal to set Spark properties or heap size settings with this option. Spark + properties should be set using a SparkConf object or the spark-defaults.conf file used with the spark-submit script. Heap size settings can be set with spark.executor.memory. @@ -305,7 +305,7 @@ Apart from these, the following properties are also available, and may be useful daily Set the time interval by which the executor logs will be rolled over. - Rolling is disabled by default. Valid values are `daily`, `hourly`, `minutely` or + Rolling is disabled by default. Valid values are daily, hourly, minutely or any interval in seconds. See spark.executor.logs.rolling.maxRetainedFiles for automatic cleaning of old logs. @@ -330,13 +330,13 @@ Apart from these, the following properties are also available, and may be useful spark.python.profile false - Enable profiling in Python worker, the profile result will show up by `sc.show_profiles()`, + Enable profiling in Python worker, the profile result will show up by sc.show_profiles(), or it will be displayed before the driver exiting. It also can be dumped into disk by - `sc.dump_profiles(path)`. If some of the profile results had been displayed manually, + sc.dump_profiles(path). If some of the profile results had been displayed manually, they will not be displayed automatically before driver exiting. - By default the `pyspark.profiler.BasicProfiler` will be used, but this can be overridden by - passing a profiler class in as a parameter to the `SparkContext` constructor. + By default the pyspark.profiler.BasicProfiler will be used, but this can be overridden by + passing a profiler class in as a parameter to the SparkContext constructor. 
@@ -460,11 +460,11 @@ Apart from these, the following properties are also available, and may be useful spark.shuffle.service.enabled false - Enables the external shuffle service. This service preserves the shuffle files written by - executors so the executors can be safely removed. This must be enabled if + Enables the external shuffle service. This service preserves the shuffle files written by + executors so the executors can be safely removed. This must be enabled if spark.dynamicAllocation.enabled is "true". The external shuffle service must be set up in order to enable it. See - dynamic allocation + dynamic allocation configuration and setup documentation for more information. @@ -747,9 +747,9 @@ Apart from these, the following properties are also available, and may be useful 1 in YARN mode, all the available cores on the worker in standalone mode. The number of cores to use on each executor. For YARN and standalone mode only. - - In standalone mode, setting this parameter allows an application to run multiple executors on - the same worker, provided that there are enough cores on that worker. Otherwise, only one + + In standalone mode, setting this parameter allows an application to run multiple executors on + the same worker, provided that there are enough cores on that worker. Otherwise, only one executor per application will run on each worker. @@ -893,14 +893,14 @@ Apart from these, the following properties are also available, and may be useful spark.akka.heartbeat.interval 1000s - This is set to a larger value to disable the transport failure detector that comes built in to - Akka. It can be enabled again, if you plan to use this feature (Not recommended). A larger - interval value reduces network overhead and a smaller value ( ~ 1 s) might be more - informative for Akka's failure detector. Tune this in combination of `spark.akka.heartbeat.pauses` - if you need to. A likely positive use case for using failure detector would be: a sensistive - failure detector can help evict rogue executors quickly. However this is usually not the case - as GC pauses and network lags are expected in a real Spark cluster. Apart from that enabling - this leads to a lot of exchanges of heart beats between nodes leading to flooding the network + This is set to a larger value to disable the transport failure detector that comes built in to + Akka. It can be enabled again, if you plan to use this feature (Not recommended). A larger + interval value reduces network overhead and a smaller value ( ~ 1 s) might be more + informative for Akka's failure detector. Tune this in combination of spark.akka.heartbeat.pauses + if you need to. A likely positive use case for using failure detector would be: a sensistive + failure detector can help evict rogue executors quickly. However this is usually not the case + as GC pauses and network lags are expected in a real Spark cluster. Apart from that enabling + this leads to a lot of exchanges of heart beats between nodes leading to flooding the network with those. @@ -909,9 +909,9 @@ Apart from these, the following properties are also available, and may be useful 6000s This is set to a larger value to disable the transport failure detector that comes built in to Akka. - It can be enabled again, if you plan to use this feature (Not recommended). Acceptable heart + It can be enabled again, if you plan to use this feature (Not recommended). Acceptable heart beat pause for Akka. This can be used to control sensitivity to GC pauses. 
Tune - this along with `spark.akka.heartbeat.interval` if you need to. + this along with spark.akka.heartbeat.interval if you need to. @@ -978,7 +978,7 @@ Apart from these, the following properties are also available, and may be useful spark.network.timeout 120s - Default timeout for all network interactions. This config will be used in place of + Default timeout for all network interactions. This config will be used in place of spark.core.connection.ack.wait.timeout, spark.akka.timeout, spark.storage.blockManagerSlaveTimeoutMs, spark.shuffle.io.connectionTimeout, spark.rpc.askTimeout or @@ -991,8 +991,8 @@ Apart from these, the following properties are also available, and may be useful Maximum number of retries when binding to a port before giving up. When a port is given a specific value (non 0), each subsequent retry will - increment the port used in the previous attempt by 1 before retrying. This - essentially allows it to try a range of ports from the start port specified + increment the port used in the previous attempt by 1 before retrying. This + essentially allows it to try a range of ports from the start port specified to port + maxRetries. @@ -1191,7 +1191,7 @@ Apart from these, the following properties are also available, and may be useful spark.dynamicAllocation.executorIdleTimeout 60s - If dynamic allocation is enabled and an executor has been idle for more than this duration, + If dynamic allocation is enabled and an executor has been idle for more than this duration, the executor will be removed. For more detail, see this description. @@ -1424,11 +1424,11 @@ Apart from these, the following properties are also available, and may be useful false Enables or disables Spark Streaming's internal backpressure mechanism (since 1.5). - This enables the Spark Streaming to control the receiving rate based on the + This enables the Spark Streaming to control the receiving rate based on the current batch scheduling delays and processing times so that the system receives - only as fast as the system can process. Internally, this dynamically sets the + only as fast as the system can process. Internally, this dynamically sets the maximum receiving rate of receivers. This rate is upper bounded by the values - `spark.streaming.receiver.maxRate` and `spark.streaming.kafka.maxRatePerPartition` + spark.streaming.receiver.maxRate and spark.streaming.kafka.maxRatePerPartition if they are set (see below). @@ -1542,15 +1542,15 @@ The following variables can be set in `spark-env.sh`: Environment VariableMeaning JAVA_HOME - Location where Java is installed (if it's not on your default `PATH`). + Location where Java is installed (if it's not on your default PATH). PYSPARK_PYTHON - Python binary executable to use for PySpark in both driver and workers (default is `python`). + Python binary executable to use for PySpark in both driver and workers (default is python). PYSPARK_DRIVER_PYTHON - Python binary executable to use for PySpark in driver only (default is PYSPARK_PYTHON). + Python binary executable to use for PySpark in driver only (default is PYSPARK_PYTHON). SPARK_LOCAL_IP @@ -1580,4 +1580,3 @@ Spark uses [log4j](http://logging.apache.org/log4j/) for logging. You can config To specify a different configuration directory other than the default "SPARK_HOME/conf", you can set SPARK_CONF_DIR. Spark will use the the configuration files (spark-defaults.conf, spark-env.sh, log4j.properties, etc) from this directory. 
- diff --git a/docs/programming-guide.md b/docs/programming-guide.md index 4cf83bb392636..8ad238315f12c 100644 --- a/docs/programming-guide.md +++ b/docs/programming-guide.md @@ -182,8 +182,8 @@ in-process. In the Spark shell, a special interpreter-aware SparkContext is already created for you, in the variable called `sc`. Making your own SparkContext will not work. You can set which master the context connects to using the `--master` argument, and you can add JARs to the classpath -by passing a comma-separated list to the `--jars` argument. You can also add dependencies -(e.g. Spark Packages) to your shell session by supplying a comma-separated list of maven coordinates +by passing a comma-separated list to the `--jars` argument. You can also add dependencies +(e.g. Spark Packages) to your shell session by supplying a comma-separated list of maven coordinates to the `--packages` argument. Any additional repositories where dependencies might exist (e.g. SonaType) can be passed to the `--repositories` argument. For example, to run `bin/spark-shell` on exactly four cores, use: @@ -217,7 +217,7 @@ context connects to using the `--master` argument, and you can add Python .zip, to the runtime path by passing a comma-separated list to `--py-files`. You can also add dependencies (e.g. Spark Packages) to your shell session by supplying a comma-separated list of maven coordinates to the `--packages` argument. Any additional repositories where dependencies might exist (e.g. SonaType) -can be passed to the `--repositories` argument. Any python dependencies a Spark Package has (listed in +can be passed to the `--repositories` argument. Any python dependencies a Spark Package has (listed in the requirements.txt of that package) must be manually installed using pip when necessary. For example, to run `bin/pyspark` on exactly four cores, use: @@ -249,8 +249,8 @@ the [IPython Notebook](http://ipython.org/notebook.html) with PyLab plot support $ PYSPARK_DRIVER_PYTHON=ipython PYSPARK_DRIVER_PYTHON_OPTS="notebook" ./bin/pyspark {% endhighlight %} -After the IPython Notebook server is launched, you can create a new "Python 2" notebook from -the "Files" tab. Inside the notebook, you can input the command `%pylab inline` as part of +After the IPython Notebook server is launched, you can create a new "Python 2" notebook from +the "Files" tab. Inside the notebook, you can input the command `%pylab inline` as part of your notebook before you start to try Spark from the IPython notebook.
@@ -418,9 +418,9 @@ Apart from text files, Spark's Python API also supports several other data forma **Writable Support** -PySpark SequenceFile support loads an RDD of key-value pairs within Java, converts Writables to base Java types, and pickles the -resulting Java objects using [Pyrolite](https://github.com/irmen/Pyrolite/). When saving an RDD of key-value pairs to SequenceFile, -PySpark does the reverse. It unpickles Python objects into Java objects and then converts them to Writables. The following +PySpark SequenceFile support loads an RDD of key-value pairs within Java, converts Writables to base Java types, and pickles the +resulting Java objects using [Pyrolite](https://github.com/irmen/Pyrolite/). When saving an RDD of key-value pairs to SequenceFile, +PySpark does the reverse. It unpickles Python objects into Java objects and then converts them to Writables. The following Writables are automatically converted: @@ -435,9 +435,9 @@ Writables are automatically converted:
MapWritabledict
-Arrays are not handled out-of-the-box. Users need to specify custom `ArrayWritable` subtypes when reading or writing. When writing, -users also need to specify custom converters that convert arrays to custom `ArrayWritable` subtypes. When reading, the default -converter will convert custom `ArrayWritable` subtypes to Java `Object[]`, which then get pickled to Python tuples. To get +Arrays are not handled out-of-the-box. Users need to specify custom `ArrayWritable` subtypes when reading or writing. When writing, +users also need to specify custom converters that convert arrays to custom `ArrayWritable` subtypes. When reading, the default +converter will convert custom `ArrayWritable` subtypes to Java `Object[]`, which then get pickled to Python tuples. To get Python `array.array` for arrays of primitive types, users need to specify custom converters. **Saving and Loading SequenceFiles** @@ -454,7 +454,7 @@ classes can be specified, but for standard Writables this is not required. **Saving and Loading Other Hadoop Input/Output Formats** -PySpark can also read any Hadoop InputFormat or write any Hadoop OutputFormat, for both 'new' and 'old' Hadoop MapReduce APIs. +PySpark can also read any Hadoop InputFormat or write any Hadoop OutputFormat, for both 'new' and 'old' Hadoop MapReduce APIs. If required, a Hadoop configuration can be passed in as a Python dict. Here is an example using the Elasticsearch ESInputFormat: @@ -474,15 +474,15 @@ Note that, if the InputFormat simply depends on a Hadoop configuration and/or in the key and value classes can easily be converted according to the above table, then this approach should work well for such cases. -If you have custom serialized binary data (such as loading data from Cassandra / HBase), then you will first need to +If you have custom serialized binary data (such as loading data from Cassandra / HBase), then you will first need to transform that data on the Scala/Java side to something which can be handled by Pyrolite's pickler. -A [Converter](api/scala/index.html#org.apache.spark.api.python.Converter) trait is provided -for this. Simply extend this trait and implement your transformation code in the ```convert``` -method. Remember to ensure that this class, along with any dependencies required to access your ```InputFormat```, are packaged into your Spark job jar and included on the PySpark +A [Converter](api/scala/index.html#org.apache.spark.api.python.Converter) trait is provided +for this. Simply extend this trait and implement your transformation code in the ```convert``` +method. Remember to ensure that this class, along with any dependencies required to access your ```InputFormat```, are packaged into your Spark job jar and included on the PySpark classpath. -See the [Python examples]({{site.SPARK_GITHUB_URL}}/tree/master/examples/src/main/python) and -the [Converter examples]({{site.SPARK_GITHUB_URL}}/tree/master/examples/src/main/scala/org/apache/spark/examples/pythonconverters) +See the [Python examples]({{site.SPARK_GITHUB_URL}}/tree/master/examples/src/main/python) and +the [Converter examples]({{site.SPARK_GITHUB_URL}}/tree/master/examples/src/main/scala/org/apache/spark/examples/pythonconverters) for examples of using Cassandra / HBase ```InputFormat``` and ```OutputFormat``` with custom converters.
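As an illustrative aside to the Writable round-trip described in the guide text above, a minimal PySpark sketch; it assumes an already-running SparkContext named `sc` and a writable scratch path, both placeholders rather than part of the patch:

```
# Minimal sketch of the SequenceFile round-trip discussed above.
# Assumes an existing SparkContext `sc` and a writable scratch directory.
pairs = sc.parallelize([("a", 1), ("b", 2), ("c", 3)])

# On save, the Python objects are pickled and converted to Writables
# (Text / IntWritable here) on the JVM side.
pairs.saveAsSequenceFile("/tmp/seqfile-demo")

# On load, the Writables are converted back and unpickled into str/int pairs.
loaded = sc.sequenceFile("/tmp/seqfile-demo")
print(sorted(loaded.collect()))   # [('a', 1), ('b', 2), ('c', 3)]
```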
@@ -758,7 +758,7 @@ One of the harder things about Spark is understanding the scope and life cycle o #### Example -Consider the naive RDD element sum below, which behaves completely differently depending on whether execution is happening within the same JVM. A common example of this is when running Spark in `local` mode (`--master = local[n]`) versus deploying a Spark application to a cluster (e.g. via spark-submit to YARN): +Consider the naive RDD element sum below, which behaves completely differently depending on whether execution is happening within the same JVM. A common example of this is when running Spark in `local` mode (`--master = local[n]`) versus deploying a Spark application to a cluster (e.g. via spark-submit to YARN):
@@ -777,7 +777,7 @@ println("Counter value: " + counter)
{% highlight java %} int counter = 0; -JavaRDD rdd = sc.parallelize(data); +JavaRDD rdd = sc.parallelize(data); // Wrong: Don't do this!! rdd.foreach(x -> counter += x); @@ -803,7 +803,7 @@ print("Counter value: " + counter) #### Local vs. cluster modes -The primary challenge is that the behavior of the above code is undefined. In local mode with a single JVM, the above code will sum the values within the RDD and store it in **counter**. This is because both the RDD and the variable **counter** are in the same memory space on the driver node. +The primary challenge is that the behavior of the above code is undefined. In local mode with a single JVM, the above code will sum the values within the RDD and store it in **counter**. This is because both the RDD and the variable **counter** are in the same memory space on the driver node. However, in `cluster` mode, what happens is more complicated, and the above may not work as intended. To execute jobs, Spark breaks up the processing of RDD operations into tasks - each of which is operated on by an executor. Prior to execution, Spark computes the **closure**. The closure is those variables and methods which must be visible for the executor to perform its computations on the RDD (in this case `foreach()`). This closure is serialized and sent to each executor. In `local` mode, there is only the one executors so everything shares the same closure. In other modes however, this is not the case and the executors running on seperate worker nodes each have their own copy of the closure. @@ -813,9 +813,9 @@ To ensure well-defined behavior in these sorts of scenarios one should use an [` In general, closures - constructs like loops or locally defined methods, should not be used to mutate some global state. Spark does not define or guarantee the behavior of mutations to objects referenced from outside of closures. Some code that does this may work in local mode, but that's just by accident and such code will not behave as expected in distributed mode. Use an Accumulator instead if some global aggregation is needed. -#### Printing elements of an RDD +#### Printing elements of an RDD Another common idiom is attempting to print out the elements of an RDD using `rdd.foreach(println)` or `rdd.map(println)`. On a single machine, this will generate the expected output and print all the RDD's elements. However, in `cluster` mode, the output to `stdout` being called by the executors is now writing to the executor's `stdout` instead, not the one on the driver, so `stdout` on the driver won't show these! To print all elements on the driver, one can use the `collect()` method to first bring the RDD to the driver node thus: `rdd.collect().foreach(println)`. This can cause the driver to run out of memory, though, because `collect()` fetches the entire RDD to a single machine; if you only need to print a few elements of the RDD, a safer approach is to use the `take()`: `rdd.take(100).foreach(println)`. - + ### Working with Key-Value Pairs
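A minimal sketch of the closure discussion above, contrasting the unreliable driver-side counter with an accumulator (assuming an existing SparkContext named `sc`; the values are placeholders):

```
# Contrasts the broken closure-counter pattern with an accumulator.
# Assumes an existing SparkContext `sc`.
rdd = sc.parallelize([1, 2, 3, 4, 5])

counter = 0
def add_to_counter(x):
    global counter
    counter += x          # updates a copy local to the worker process

rdd.foreach(add_to_counter)
print(counter)            # typically still 0 on the driver

total = sc.accumulator(0)
rdd.foreach(lambda x: total.add(x))
print(total.value)        # 15: worker updates are merged back to the driver
```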
@@ -859,7 +859,7 @@ only available on RDDs of key-value pairs. The most common ones are distributed "shuffle" operations, such as grouping or aggregating the elements by a key. -In Java, key-value pairs are represented using the +In Java, key-value pairs are represented using the [scala.Tuple2](http://www.scala-lang.org/api/{{site.SCALA_VERSION}}/index.html#scala.Tuple2) class from the Scala standard library. You can simply call `new Tuple2(a, b)` to create a tuple, and access its fields later with `tuple._1()` and `tuple._2()`. @@ -974,7 +974,7 @@ for details. groupByKey([numTasks]) When called on a dataset of (K, V) pairs, returns a dataset of (K, Iterable<V>) pairs.
Note: If you are grouping in order to perform an aggregation (such as a sum or - average) over each key, using reduceByKey or aggregateByKey will yield much better + average) over each key, using reduceByKey or aggregateByKey will yield much better performance.
Note: By default, the level of parallelism in the output depends on the number of partitions of the parent RDD. @@ -1025,7 +1025,7 @@ for details. repartitionAndSortWithinPartitions(partitioner) Repartition the RDD according to the given partitioner and, within each resulting partition, - sort records by their keys. This is more efficient than calling repartition and then sorting within + sort records by their keys. This is more efficient than calling repartition and then sorting within each partition because it can push the sorting down into the shuffle machinery. @@ -1038,7 +1038,7 @@ RDD API doc [Java](api/java/index.html?org/apache/spark/api/java/JavaRDD.html), [Python](api/python/pyspark.html#pyspark.RDD), [R](api/R/index.html)) - + and pair RDD functions doc ([Scala](api/scala/index.html#org.apache.spark.rdd.PairRDDFunctions), [Java](api/java/index.html?org/apache/spark/api/java/JavaPairRDD.html)) @@ -1094,7 +1094,7 @@ for details. foreach(func) - Run a function func on each element of the dataset. This is usually done for side effects such as updating an Accumulator or interacting with external storage systems. + Run a function func on each element of the dataset. This is usually done for side effects such as updating an Accumulator or interacting with external storage systems.
Note: modifying variables other than Accumulators outside of the foreach() may result in undefined behavior. See Understanding closures for more details. @@ -1118,13 +1118,13 @@ co-located to compute the result. In Spark, data is generally not distributed across partitions to be in the necessary place for a specific operation. During computations, a single task will operate on a single partition - thus, to organize all the data for a single `reduceByKey` reduce task to execute, Spark needs to perform an -all-to-all operation. It must read from all partitions to find all the values for all keys, -and then bring together values across partitions to compute the final result for each key - +all-to-all operation. It must read from all partitions to find all the values for all keys, +and then bring together values across partitions to compute the final result for each key - this is called the **shuffle**. Although the set of elements in each partition of newly shuffled data will be deterministic, and so -is the ordering of partitions themselves, the ordering of these elements is not. If one desires predictably -ordered data following shuffle then it's possible to use: +is the ordering of partitions themselves, the ordering of these elements is not. If one desires predictably +ordered data following shuffle then it's possible to use: * `mapPartitions` to sort each partition using, for example, `.sorted` * `repartitionAndSortWithinPartitions` to efficiently sort partitions while simultaneously repartitioning @@ -1141,26 +1141,26 @@ network I/O. To organize data for the shuffle, Spark generates sets of tasks - * organize the data, and a set of *reduce* tasks to aggregate it. This nomenclature comes from MapReduce and does not directly relate to Spark's `map` and `reduce` operations. -Internally, results from individual map tasks are kept in memory until they can't fit. Then, these -are sorted based on the target partition and written to a single file. On the reduce side, tasks +Internally, results from individual map tasks are kept in memory until they can't fit. Then, these +are sorted based on the target partition and written to a single file. On the reduce side, tasks read the relevant sorted blocks. - -Certain shuffle operations can consume significant amounts of heap memory since they employ -in-memory data structures to organize records before or after transferring them. Specifically, -`reduceByKey` and `aggregateByKey` create these structures on the map side, and `'ByKey` operations -generate these on the reduce side. When data does not fit in memory Spark will spill these tables + +Certain shuffle operations can consume significant amounts of heap memory since they employ +in-memory data structures to organize records before or after transferring them. Specifically, +`reduceByKey` and `aggregateByKey` create these structures on the map side, and `'ByKey` operations +generate these on the reduce side. When data does not fit in memory Spark will spill these tables to disk, incurring the additional overhead of disk I/O and increased garbage collection. Shuffle also generates a large number of intermediate files on disk. As of Spark 1.3, these files -are preserved until the corresponding RDDs are no longer used and are garbage collected. -This is done so the shuffle files don't need to be re-created if the lineage is re-computed. -Garbage collection may happen only after a long period time, if the application retains references -to these RDDs or if GC does not kick in frequently. 
This means that long-running Spark jobs may +are preserved until the corresponding RDDs are no longer used and are garbage collected. +This is done so the shuffle files don't need to be re-created if the lineage is re-computed. +Garbage collection may happen only after a long period time, if the application retains references +to these RDDs or if GC does not kick in frequently. This means that long-running Spark jobs may consume a large amount of disk space. The temporary storage directory is specified by the `spark.local.dir` configuration parameter when configuring the Spark context. Shuffle behavior can be tuned by adjusting a variety of configuration parameters. See the -'Shuffle Behavior' section within the [Spark Configuration Guide](configuration.html). +'Shuffle Behavior' section within the [Spark Configuration Guide](configuration.html). ## RDD Persistence @@ -1246,7 +1246,7 @@ efficiency. We recommend going through the following process to select one: This is the most CPU-efficient option, allowing operations on the RDDs to run as fast as possible. * If not, try using `MEMORY_ONLY_SER` and [selecting a fast serialization library](tuning.html) to -make the objects much more space-efficient, but still reasonably fast to access. +make the objects much more space-efficient, but still reasonably fast to access. * Don't spill to disk unless the functions that computed your datasets are expensive, or they filter a large amount of the data. Otherwise, recomputing a partition may be as fast as reading it from @@ -1345,7 +1345,7 @@ Accumulators are variables that are only "added" to through an associative opera therefore be efficiently supported in parallel. They can be used to implement counters (as in MapReduce) or sums. Spark natively supports accumulators of numeric types, and programmers can add support for new types. If accumulators are created with a name, they will be -displayed in Spark's UI. This can be useful for understanding the progress of +displayed in Spark's UI. This can be useful for understanding the progress of running stages (NOTE: this is not yet supported in Python). An accumulator is created from an initial value `v` by calling `SparkContext.accumulator(v)`. Tasks @@ -1474,8 +1474,8 @@ vecAccum = sc.accumulator(Vector(...), VectorAccumulatorParam())
-For accumulator updates performed inside actions only, Spark guarantees that each task's update to the accumulator -will only be applied once, i.e. restarted tasks will not update the value. In transformations, users should be aware +For accumulator updates performed inside actions only, Spark guarantees that each task's update to the accumulator +will only be applied once, i.e. restarted tasks will not update the value. In transformations, users should be aware of that each task's update may be applied more than once if tasks or job stages are re-executed. Accumulators do not change the lazy evaluation model of Spark. If they are being updated within an operation on an RDD, their value is only updated once that RDD is computed as part of an action. Consequently, accumulator updates are not guaranteed to be executed when made within a lazy transformation like `map()`. The below code fragment demonstrates this property: @@ -1486,7 +1486,7 @@ Accumulators do not change the lazy evaluation model of Spark. If they are being {% highlight scala %} val accum = sc.accumulator(0) data.map { x => accum += x; f(x) } -// Here, accum is still 0 because no actions have caused the `map` to be computed. +// Here, accum is still 0 because no actions have caused the map to be computed. {% endhighlight %}
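To make the lazy-evaluation point above concrete, here is a minimal sketch (assuming the same `SparkContext` `sc` and an integer RDD `data` as in the snippet) showing that the update only lands once an action runs:

{% highlight scala %}
val accum = sc.accumulator(0)
val mapped = data.map { x => accum += x; x }
// Still 0 here: map is lazy, so no task has run and no update has been applied.
println(accum.value)
// An action forces the map to be computed and the tasks' updates to be merged in.
mapped.count()
println(accum.value) // now reflects the sum of the elements
{% endhighlight %}

Because the update happens inside a transformation rather than an action, it may be applied more than once if tasks or stages are re-executed, as noted above.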
@@ -1553,7 +1553,7 @@ Several changes were made to the Java API: code that `extends Function` should `implement Function` instead. * New variants of the `map` transformations, like `mapToPair` and `mapToDouble`, were added to create RDDs of special data types. -* Grouping operations like `groupByKey`, `cogroup` and `join` have changed from returning +* Grouping operations like `groupByKey`, `cogroup` and `join` have changed from returning `(Key, List)` pairs to `(Key, Iterable)`.
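Looping back to the shuffle-ordering discussion earlier in this guide, a minimal sketch of the two options mentioned there (assuming a pair RDD named `pairs` with orderable keys; the partition count of 8 is arbitrary):

{% highlight scala %}
import org.apache.spark.HashPartitioner

// Push the sort into the shuffle itself: each output partition comes back sorted by key.
val sortedInShuffle = pairs.repartitionAndSortWithinPartitions(new HashPartitioner(8))

// Or sort each already-shuffled partition after the fact.
val sortedPerPartition = pairs.mapPartitions(
  iter => iter.toList.sortBy(_._1).iterator,
  preservesPartitioning = true)
{% endhighlight %}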
diff --git a/docs/running-on-mesos.md b/docs/running-on-mesos.md index 330c159c67bca..460a66f37dd64 100644 --- a/docs/running-on-mesos.md +++ b/docs/running-on-mesos.md @@ -245,7 +245,7 @@ See the [configuration page](configuration.html) for information on Spark config spark.mesos.coarse false - If set to "true", runs over Mesos clusters in + If set to true, runs over Mesos clusters in "coarse-grained" sharing mode, where Spark acquires one long-lived Mesos task on each machine instead of one Mesos task per Spark task. This gives lower-latency scheduling for short queries, but leaves resources in use @@ -254,16 +254,16 @@ See the [configuration page](configuration.html) for information on Spark config spark.mesos.extra.cores - 0 + 0 Set the extra amount of cpus to request per task. This setting is only used for Mesos coarse grain mode. The total amount of cores requested per task is the number of cores in the offer plus the extra cores configured. - Note that total amount of cores the executor will request in total will not exceed the spark.cores.max setting. + Note that total amount of cores the executor will request in total will not exceed the spark.cores.max setting. spark.mesos.mesosExecutor.cores - 1.0 + 1.0 (Fine-grained mode only) Number of cores to give each Mesos executor. This does not include the cores used to run the Spark tasks. In other words, even if no Spark task @@ -287,7 +287,7 @@ See the [configuration page](configuration.html) for information on Spark config Set the list of volumes which will be mounted into the Docker image, which was set using spark.mesos.executor.docker.image. The format of this property is a comma-separated list of - mappings following the form passed to docker run -v. That is they take the form: + mappings following the form passed to docker run -v. That is they take the form:
[host_path:]container_path[:ro|:rw]
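For illustration, a hedged sketch of setting these Mesos properties programmatically (the image name and paths are placeholders, not recommendations):

{% highlight scala %}
import org.apache.spark.SparkConf

val conf = new SparkConf()
  .set("spark.mesos.coarse", "true")
  // Image the executors run in; the value here is purely illustrative.
  .set("spark.mesos.executor.docker.image", "example/spark-executor:1.5.0")
  // Comma-separated mappings in the docker run -v form shown above.
  .set("spark.mesos.executor.docker.volumes",
    "/var/data:/var/data:ro,/var/log/spark:/var/log/spark:rw")
{% endhighlight %}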
@@ -318,7 +318,7 @@ See the [configuration page](configuration.html) for information on Spark config executor memory * 0.10, with minimum of 384 The amount of additional memory, specified in MB, to be allocated per executor. By default, - the overhead will be larger of either 384 or 10% of `spark.executor.memory`. If it's set, + the overhead will be larger of either 384 or 10% of spark.executor.memory. If set, the final overhead will be this value. @@ -339,7 +339,7 @@ See the [configuration page](configuration.html) for information on Spark config spark.mesos.secret - (none)/td> + (none) Set the secret with which Spark framework will use to authenticate with Mesos. diff --git a/docs/running-on-yarn.md b/docs/running-on-yarn.md index 3a961d245f3de..0e25ccf512c02 100644 --- a/docs/running-on-yarn.md +++ b/docs/running-on-yarn.md @@ -23,7 +23,7 @@ Unlike [Spark standalone](spark-standalone.html) and [Mesos](running-on-mesos.ht To launch a Spark application in `yarn-cluster` mode: $ ./bin/spark-submit --class path.to.your.Class --master yarn-cluster [options] [app options] - + For example: $ ./bin/spark-submit --class org.apache.spark.examples.SparkPi \ @@ -43,7 +43,7 @@ To launch a Spark application in `yarn-client` mode, do the same, but replace `y ## Adding Other JARs -In `yarn-cluster` mode, the driver runs on a different machine than the client, so `SparkContext.addJar` won't work out of the box with files that are local to the client. To make files on the client available to `SparkContext.addJar`, include them with the `--jars` option in the launch command. +In `yarn-cluster` mode, the driver runs on a different machine than the client, so `SparkContext.addJar` won't work out of the box with files that are local to the client. To make files on the client available to `SparkContext.addJar`, include them with the `--jars` option in the launch command. $ ./bin/spark-submit --class my.main.Class \ --master yarn-cluster \ @@ -64,16 +64,16 @@ Most of the configs are the same for Spark on YARN as for other deployment modes # Debugging your Application -In YARN terminology, executors and application masters run inside "containers". YARN has two modes for handling container logs after an application has completed. If log aggregation is turned on (with the `yarn.log-aggregation-enable` config), container logs are copied to HDFS and deleted on the local machine. These logs can be viewed from anywhere on the cluster with the "yarn logs" command. +In YARN terminology, executors and application masters run inside "containers". YARN has two modes for handling container logs after an application has completed. If log aggregation is turned on (with the `yarn.log-aggregation-enable` config), container logs are copied to HDFS and deleted on the local machine. These logs can be viewed from anywhere on the cluster with the `yarn logs` command. yarn logs -applicationId - + will print out the contents of all log files from all containers from the given application. You can also view the container log files directly in HDFS using the HDFS shell or API. The directory where they are located can be found by looking at your YARN configs (`yarn.nodemanager.remote-app-log-dir` and `yarn.nodemanager.remote-app-log-dir-suffix`). The logs are also available on the Spark Web UI under the Executors Tab. You need to have both the Spark history server and the MapReduce history server running and configure `yarn.log.server.url` in `yarn-site.xml` properly. 
The log URL on the Spark history server UI will redirect you to the MapReduce history server to show the aggregated logs. When log aggregation isn't turned on, logs are retained locally on each machine under `YARN_APP_LOGS_DIR`, which is usually configured to `/tmp/logs` or `$HADOOP_HOME/logs/userlogs` depending on the Hadoop version and installation. Viewing logs for a container requires going to the host that contains them and looking in this directory. Subdirectories organize log files by application ID and container ID. The logs are also available on the Spark Web UI under the Executors Tab and doesn't require running the MapReduce history server. To review per-container launch environment, increase `yarn.nodemanager.delete.debug-delay-sec` to a -large value (e.g. 36000), and then access the application cache through `yarn.nodemanager.local-dirs` +large value (e.g. `36000`), and then access the application cache through `yarn.nodemanager.local-dirs` on the nodes on which containers are launched. This directory contains the launch script, JARs, and all environment variables used for launching each container. This process is useful for debugging classpath problems in particular. (Note that enabling this requires admin privileges on cluster @@ -92,7 +92,7 @@ Note that for the first option, both executors and the application master will s log4j configuration, which may cause issues when they run on the same node (e.g. trying to write to the same log file). -If you need a reference to the proper location to put log files in the YARN so that YARN can properly display and aggregate them, use `spark.yarn.app.container.log.dir` in your log4j.properties. For example, `log4j.appender.file_appender.File=${spark.yarn.app.container.log.dir}/spark.log`. For streaming application, configuring `RollingFileAppender` and setting file location to YARN's log directory will avoid disk overflow caused by large log file, and logs can be accessed using YARN's log utility. +If you need a reference to the proper location to put log files in the YARN so that YARN can properly display and aggregate them, use `spark.yarn.app.container.log.dir` in your `log4j.properties`. For example, `log4j.appender.file_appender.File=${spark.yarn.app.container.log.dir}/spark.log`. For streaming applications, configuring `RollingFileAppender` and setting file location to YARN's log directory will avoid disk overflow caused by large log files, and logs can be accessed using YARN's log utility. #### Spark Properties @@ -100,24 +100,26 @@ If you need a reference to the proper location to put log files in the YARN so t Property NameDefaultMeaning spark.yarn.am.memory - 512m + 512m Amount of memory to use for the YARN Application Master in client mode, in the same format as JVM memory strings (e.g. 512m, 2g). In cluster mode, use spark.driver.memory instead. +

+ Use lower-case suffixes, e.g. k, m, g, t, and p, for kibi-, mebi-, gibi-, tebi-, and pebibytes, respectively. spark.driver.cores - 1 + 1 Number of cores used by the driver in YARN cluster mode. - Since the driver is run in the same JVM as the YARN Application Master in cluster mode, this also controls the cores used by the YARN AM. - In client mode, use spark.yarn.am.cores to control the number of cores used by the YARN AM instead. + Since the driver is run in the same JVM as the YARN Application Master in cluster mode, this also controls the cores used by the YARN Application Master. + In client mode, use spark.yarn.am.cores to control the number of cores used by the YARN Application Master instead. spark.yarn.am.cores - 1 + 1 Number of cores to use for the YARN Application Master in client mode. In cluster mode, use spark.driver.cores instead. @@ -125,39 +127,39 @@ If you need a reference to the proper location to put log files in the YARN so t spark.yarn.am.waitTime - 100s + 100s - In `yarn-cluster` mode, time for the application master to wait for the - SparkContext to be initialized. In `yarn-client` mode, time for the application master to wait + In yarn-cluster mode, time for the YARN Application Master to wait for the + SparkContext to be initialized. In yarn-client mode, time for the YARN Application Master to wait for the driver to connect to it. spark.yarn.submit.file.replication - The default HDFS replication (usually 3) + The default HDFS replication (usually 3) HDFS replication level for the files uploaded into HDFS for the application. These include things like the Spark jar, the app jar, and any distributed cache files/archives. spark.yarn.preserve.staging.files - false + false - Set to true to preserve the staged files (Spark jar, app jar, distributed cache files) at the end of the job rather than delete them. + Set to true to preserve the staged files (Spark jar, app jar, distributed cache files) at the end of the job rather than delete them. spark.yarn.scheduler.heartbeat.interval-ms - 3000 + 3000 The interval in ms in which the Spark application master heartbeats into the YARN ResourceManager. - The value is capped at half the value of YARN's configuration for the expiry interval - (yarn.am.liveness-monitor.expiry-interval-ms). + The value is capped at half the value of YARN's configuration for the expiry interval, i.e. + yarn.am.liveness-monitor.expiry-interval-ms. spark.yarn.scheduler.initial-allocation.interval - 200ms + 200ms The initial interval in which the Spark application master eagerly heartbeats to the YARN ResourceManager when there are pending container allocation requests. It should be no larger than @@ -177,8 +179,8 @@ If you need a reference to the proper location to put log files in the YARN so t spark.yarn.historyServer.address (none) - The address of the Spark history server (i.e. host.com:18080). The address should not contain a scheme (http://). Defaults to not being set since the history server is an optional service. This address is given to the YARN ResourceManager when the Spark application finishes to link the application from the ResourceManager UI to the Spark history server UI. - For this property, YARN properties can be used as variables, and these are substituted by Spark at runtime. For eg, if the Spark history server runs on the same node as the YARN ResourceManager, it can be set to `${hadoopconf-yarn.resourcemanager.hostname}:18080`. + The address of the Spark history server, e.g. host.com:18080. 
The address should not contain a scheme (http://). Defaults to not being set since the history server is an optional service. This address is given to the YARN ResourceManager when the Spark application finishes to link the application from the ResourceManager UI to the Spark history server UI. + For this property, YARN properties can be used as variables, and these are substituted by Spark at runtime. For example, if the Spark history server runs on the same node as the YARN ResourceManager, it can be set to ${hadoopconf-yarn.resourcemanager.hostname}:18080. @@ -197,42 +199,42 @@ If you need a reference to the proper location to put log files in the YARN so t spark.executor.instances - 2 + 2 - The number of executors. Note that this property is incompatible with spark.dynamicAllocation.enabled. If both spark.dynamicAllocation.enabled and spark.executor.instances are specified, dynamic allocation is turned off and the specified number of spark.executor.instances is used. + The number of executors. Note that this property is incompatible with spark.dynamicAllocation.enabled. If both spark.dynamicAllocation.enabled and spark.executor.instances are specified, dynamic allocation is turned off and the specified number of spark.executor.instances is used. spark.yarn.executor.memoryOverhead executorMemory * 0.10, with minimum of 384 - The amount of off heap memory (in megabytes) to be allocated per executor. This is memory that accounts for things like VM overheads, interned strings, other native overheads, etc. This tends to grow with the executor size (typically 6-10%). + The amount of off-heap memory (in megabytes) to be allocated per executor. This is memory that accounts for things like VM overheads, interned strings, other native overheads, etc. This tends to grow with the executor size (typically 6-10%). spark.yarn.driver.memoryOverhead driverMemory * 0.10, with minimum of 384 - The amount of off heap memory (in megabytes) to be allocated per driver in cluster mode. This is memory that accounts for things like VM overheads, interned strings, other native overheads, etc. This tends to grow with the container size (typically 6-10%). + The amount of off-heap memory (in megabytes) to be allocated per driver in cluster mode. This is memory that accounts for things like VM overheads, interned strings, other native overheads, etc. This tends to grow with the container size (typically 6-10%). spark.yarn.am.memoryOverhead AM memory * 0.10, with minimum of 384 - Same as spark.yarn.driver.memoryOverhead, but for the Application Master in client mode. + Same as spark.yarn.driver.memoryOverhead, but for the YARN Application Master in client mode. spark.yarn.am.port (random) - Port for the YARN Application Master to listen on. In YARN client mode, this is used to communicate between the Spark driver running on a gateway and the Application Master running on YARN. In YARN cluster mode, this is used for the dynamic executor feature, where it handles the kill from the scheduler backend. + Port for the YARN Application Master to listen on. In YARN client mode, this is used to communicate between the Spark driver running on a gateway and the YARN Application Master running on YARN. In YARN cluster mode, this is used for the dynamic executor feature, where it handles the kill from the scheduler backend. spark.yarn.queue - default + default The name of the YARN queue to which the application is submitted. 
@@ -245,18 +247,18 @@ If you need a reference to the proper location to put log files in the YARN so t By default, Spark on YARN will use a Spark jar installed locally, but the Spark jar can also be in a world-readable location on HDFS. This allows YARN to cache it on nodes so that it doesn't need to be distributed each time an application runs. To point to a jar on HDFS, for example, - set this configuration to "hdfs:///some/path". + set this configuration to hdfs:///some/path. spark.yarn.access.namenodes (none) - A list of secure HDFS namenodes your Spark application is going to access. For - example, `spark.yarn.access.namenodes=hdfs://nn1.com:8032,hdfs://nn2.com:8032`. - The Spark application must have acess to the namenodes listed and Kerberos must - be properly configured to be able to access them (either in the same realm or in - a trusted realm). Spark acquires security tokens for each of the namenodes so that + A comma-separated list of secure HDFS namenodes your Spark application is going to access. For + example, spark.yarn.access.namenodes=hdfs://nn1.com:8032,hdfs://nn2.com:8032. + The Spark application must have access to the namenodes listed and Kerberos must + be properly configured to be able to access them (either in the same realm or in + a trusted realm). Spark acquires security tokens for each of the namenodes so that the Spark application can access those remote HDFS clusters. @@ -264,18 +266,18 @@ If you need a reference to the proper location to put log files in the YARN so t spark.yarn.appMasterEnv.[EnvironmentVariableName] (none) - Add the environment variable specified by EnvironmentVariableName to the - Application Master process launched on YARN. The user can specify multiple of - these and to set multiple environment variables. In `yarn-cluster` mode this controls - the environment of the SPARK driver and in `yarn-client` mode it only controls - the environment of the executor launcher. + Add the environment variable specified by EnvironmentVariableName to the + Application Master process launched on YARN. The user can specify multiple of + these and to set multiple environment variables. In yarn-cluster mode this controls + the environment of the Spark driver and in yarn-client mode it only controls + the environment of the executor launcher. spark.yarn.containerLauncherMaxThreads - 25 + 25 - The maximum number of threads to use in the application master for launching executor containers. + The maximum number of threads to use in the YARN Application Master for launching executor containers. @@ -283,19 +285,19 @@ If you need a reference to the proper location to put log files in the YARN so t (none) A string of extra JVM options to pass to the YARN Application Master in client mode. - In cluster mode, use `spark.driver.extraJavaOptions` instead. + In cluster mode, use spark.driver.extraJavaOptions instead. spark.yarn.am.extraLibraryPath (none) - Set a special library path to use when launching the application master in client mode. + Set a special library path to use when launching the YARN Application Master in client mode. spark.yarn.maxAppAttempts - yarn.resourcemanager.am.max-attempts in YARN + yarn.resourcemanager.am.max-attempts in YARN The maximum number of attempts that will be made to submit the application. It should be no larger than the global number of max attempts in the YARN configuration. 
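To show how a few of the properties in this table fit together, a minimal sketch using `SparkConf` (every value below is a placeholder chosen for illustration, not a recommendation):

{% highlight scala %}
import org.apache.spark.SparkConf

val conf = new SparkConf()
  // Client-mode Application Master resources; in cluster mode use
  // spark.driver.memory and spark.driver.cores instead.
  .set("spark.yarn.am.memory", "1g")
  .set("spark.yarn.am.cores", "2")
  // Fixed executor count; incompatible with spark.dynamicAllocation.enabled.
  .set("spark.executor.instances", "4")
  // Comma-separated secure namenodes the application will access.
  .set("spark.yarn.access.namenodes", "hdfs://nn1.com:8032,hdfs://nn2.com:8032")
  // One environment variable per spark.yarn.appMasterEnv.* property
  // (the variable name here is hypothetical).
  .set("spark.yarn.appMasterEnv.SPARK_LOG_LEVEL", "INFO")
  .set("spark.yarn.queue", "default")
{% endhighlight %}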
@@ -303,10 +305,10 @@ If you need a reference to the proper location to put log files in the YARN so t spark.yarn.submit.waitAppCompletion - true + true In YARN cluster mode, controls whether the client waits to exit until the application completes. - If set to true, the client process will stay alive reporting the application's status. + If set to true, the client process will stay alive reporting the application's status. Otherwise, the client process will exit after submission. @@ -332,7 +334,7 @@ If you need a reference to the proper location to put log files in the YARN so t (none) The full path to the file that contains the keytab for the principal specified above. - This keytab will be copied to the node running the Application Master via the Secure Distributed Cache, + This keytab will be copied to the node running the YARN Application Master via the Secure Distributed Cache, for renewing the login tickets and the delegation tokens periodically. @@ -371,14 +373,14 @@ If you need a reference to the proper location to put log files in the YARN so t spark.yarn.security.tokens.${service}.enabled - true + true Controls whether to retrieve delegation tokens for non-HDFS services when security is enabled. By default, delegation tokens for all supported services are retrieved when those services are configured, but it's possible to disable that behavior if it somehow conflicts with the application being run.

- Currently supported services are: hive, hbase + Currently supported services are: hive, hbase @@ -387,5 +389,5 @@ If you need a reference to the proper location to put log files in the YARN so t - Whether core requests are honored in scheduling decisions depends on which scheduler is in use and how it is configured. - In `yarn-cluster` mode, the local directories used by the Spark executors and the Spark driver will be the local directories configured for YARN (Hadoop YARN config `yarn.nodemanager.local-dirs`). If the user specifies `spark.local.dir`, it will be ignored. In `yarn-client` mode, the Spark executors will use the local directories configured for YARN while the Spark driver will use those defined in `spark.local.dir`. This is because the Spark driver does not run on the YARN cluster in `yarn-client` mode, only the Spark executors do. -- The `--files` and `--archives` options support specifying file names with the # similar to Hadoop. For example you can specify: `--files localtest.txt#appSees.txt` and this will upload the file you have locally named localtest.txt into HDFS but this will be linked to by the name `appSees.txt`, and your application should use the name as `appSees.txt` to reference it when running on YARN. +- The `--files` and `--archives` options support specifying file names with the # similar to Hadoop. For example you can specify: `--files localtest.txt#appSees.txt` and this will upload the file you have locally named `localtest.txt` into HDFS but this will be linked to by the name `appSees.txt`, and your application should use the name as `appSees.txt` to reference it when running on YARN. - The `--jars` option allows the `SparkContext.addJar` function to work if you are using it with local files and running in `yarn-cluster` mode. It does not need to be used if you are using it with HDFS, HTTP, HTTPS, or FTP files. diff --git a/docs/sql-programming-guide.md b/docs/sql-programming-guide.md index 7ae9244c271e3..a1cbc7de97c65 100644 --- a/docs/sql-programming-guide.md +++ b/docs/sql-programming-guide.md @@ -1676,7 +1676,7 @@ results <- collect(sql(sqlContext, "FROM src SELECT key, value")) ### Interacting with Different Versions of Hive Metastore One of the most important pieces of Spark SQL's Hive support is interaction with Hive metastore, -which enables Spark SQL to access metadata of Hive tables. Starting from Spark 1.4.0, a single binary +which enables Spark SQL to access metadata of Hive tables. Starting from Spark 1.4.0, a single binary build of Spark SQL can be used to query different versions of Hive metastores, using the configuration described below. Note that independent of the version of Hive that is being used to talk to the metastore, internally Spark SQL will compile against Hive 1.2.1 and use those classes for internal execution (serdes, UDFs, UDAFs, etc). @@ -1706,8 +1706,8 @@ The following options can be used to configure the version of Hive that is used either 1.2.1 or not defined.

  • maven
  • Use Hive jars of specified version downloaded from Maven repositories. This configuration - is not generally recommended for production deployments. -
  • A classpath in the standard format for the JVM. This classpath must include all of Hive + is not generally recommended for production deployments. +
  • A classpath in the standard format for the JVM. This classpath must include all of Hive and its dependencies, including the correct version of Hadoop. These jars only need to be present on the driver, but if you are running in yarn cluster mode then you must ensure they are packaged with you application.
  • @@ -1806,7 +1806,7 @@ the Data Sources API. The following options are supported:
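As a hedged illustration of choosing between these options, a sketch using `SparkConf`; the keys are the standard `spark.sql.hive.metastore.version` and `spark.sql.hive.metastore.jars` properties this table documents, and the version value is only an example:

{% highlight scala %}
import org.apache.spark.SparkConf

val conf = new SparkConf()
  // Version of the Hive metastore to talk to (illustrative value);
  // typically set before the HiveContext is created.
  .set("spark.sql.hive.metastore.version", "0.13.1")
  // "builtin" (default), "maven", or a JVM classpath as described above.
  .set("spark.sql.hive.metastore.jars", "maven")
{% endhighlight %}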
    {% highlight scala %} -val jdbcDF = sqlContext.read.format("jdbc").options( +val jdbcDF = sqlContext.read.format("jdbc").options( Map("url" -> "jdbc:postgresql:dbserver", "dbtable" -> "schema.tablename")).load() {% endhighlight %} @@ -2023,11 +2023,11 @@ options. - Optimized execution using manually managed memory (Tungsten) is now enabled by default, along with code generation for expression evaluation. These features can both be disabled by setting - `spark.sql.tungsten.enabled` to `false. - - Parquet schema merging is no longer enabled by default. It can be re-enabled by setting + `spark.sql.tungsten.enabled` to `false`. + - Parquet schema merging is no longer enabled by default. It can be re-enabled by setting `spark.sql.parquet.mergeSchema` to `true`. - - Resolution of strings to columns in python now supports using dots (`.`) to qualify the column or - access nested values. For example `df['table.column.nestedField']`. However, this means that if + - Resolution of strings to columns in python now supports using dots (`.`) to qualify the column or + access nested values. For example `df['table.column.nestedField']`. However, this means that if your column name contains any dots you must now escape them using backticks (e.g., ``table.`column.with.dots`.nested``). - In-memory columnar storage partition pruning is on by default. It can be disabled by setting `spark.sql.inMemoryColumnarStorage.partitionPruning` to `false`. diff --git a/docs/submitting-applications.md b/docs/submitting-applications.md index 7ea4d6f1a3f8f..915be0f479157 100644 --- a/docs/submitting-applications.md +++ b/docs/submitting-applications.md @@ -103,7 +103,7 @@ run it with `--help`. Here are a few examples of common options: export HADOOP_CONF_DIR=XXX ./bin/spark-submit \ --class org.apache.spark.examples.SparkPi \ - --master yarn-cluster \ # can also be `yarn-client` for client mode + --master yarn-cluster \ # can also be yarn-client for client mode --executor-memory 20G \ --num-executors 50 \ /path/to/examples.jar \ @@ -174,9 +174,9 @@ This can use up a significant amount of space over time and will need to be clea is handled automatically, and with Spark standalone, automatic cleanup can be configured with the `spark.worker.cleanup.appDataTtl` property. -Users may also include any other dependencies by supplying a comma-delimited list of maven coordinates -with `--packages`. All transitive dependencies will be handled when using this command. Additional -repositories (or resolvers in SBT) can be added in a comma-delimited fashion with the flag `--repositories`. +Users may also include any other dependencies by supplying a comma-delimited list of maven coordinates +with `--packages`. All transitive dependencies will be handled when using this command. Additional +repositories (or resolvers in SBT) can be added in a comma-delimited fashion with the flag `--repositories`. These commands can be used with `pyspark`, `spark-shell`, and `spark-submit` to include Spark Packages. For Python, the equivalent `--py-files` option can be used to distribute `.egg`, `.zip` and `.py` libraries From 331f0b10f78a37d96d3e573d211d74a0935265db Mon Sep 17 00:00:00 2001 From: Meihua Wu Date: Mon, 21 Sep 2015 12:09:00 -0700 Subject: [PATCH 361/802] [SPARK-9642] [ML] LinearRegression should supported weighted data In many modeling application, data points are not necessarily sampled with equal probabilities. Linear regression should support weighting which account the over or under sampling. work in progress. 
Author: Meihua Wu Closes #8631 from rotationsymmetry/SPARK-9642. --- .../ml/regression/LinearRegression.scala | 164 +++++++++++------- .../ml/regression/LinearRegressionSuite.scala | 88 ++++++++++ project/MimaExcludes.scala | 8 +- 3 files changed, 191 insertions(+), 69 deletions(-) diff --git a/mllib/src/main/scala/org/apache/spark/ml/regression/LinearRegression.scala b/mllib/src/main/scala/org/apache/spark/ml/regression/LinearRegression.scala index e4602d36ccc87..78a67c5fdab20 100644 --- a/mllib/src/main/scala/org/apache/spark/ml/regression/LinearRegression.scala +++ b/mllib/src/main/scala/org/apache/spark/ml/regression/LinearRegression.scala @@ -31,21 +31,29 @@ import org.apache.spark.ml.util.Identifiable import org.apache.spark.mllib.evaluation.RegressionMetrics import org.apache.spark.mllib.linalg.{Vector, Vectors} import org.apache.spark.mllib.linalg.BLAS._ -import org.apache.spark.mllib.regression.LabeledPoint import org.apache.spark.mllib.stat.MultivariateOnlineSummarizer import org.apache.spark.rdd.RDD import org.apache.spark.sql.{DataFrame, Row} -import org.apache.spark.sql.functions.{col, udf} -import org.apache.spark.sql.types.StructField +import org.apache.spark.sql.functions.{col, udf, lit} import org.apache.spark.storage.StorageLevel -import org.apache.spark.util.StatCounter /** * Params for linear regression. */ private[regression] trait LinearRegressionParams extends PredictorParams with HasRegParam with HasElasticNetParam with HasMaxIter with HasTol - with HasFitIntercept with HasStandardization + with HasFitIntercept with HasStandardization with HasWeightCol + +/** + * Class that represents an instance of weighted data point with label and features. + * + * TODO: Refactor this class to proper place. + * + * @param label Label for this data point. + * @param weight The weight of this instance. + * @param features The vector of features for this data point. + */ +private[regression] case class Instance(label: Double, weight: Double, features: Vector) /** * :: Experimental :: @@ -123,30 +131,43 @@ class LinearRegression(override val uid: String) def setTol(value: Double): this.type = set(tol, value) setDefault(tol -> 1E-6) + /** + * Whether to over-/under-sample training instances according to the given weights in weightCol. + * If empty, all instances are treated equally (weight 1.0). + * Default is empty, so all instances have weight one. + * @group setParam + */ + def setWeightCol(value: String): this.type = set(weightCol, value) + setDefault(weightCol -> "") + override protected def train(dataset: DataFrame): LinearRegressionModel = { // Extract columns from data. If dataset is persisted, do not persist instances. 
- val instances = extractLabeledPoints(dataset).map { - case LabeledPoint(label: Double, features: Vector) => (label, features) + val w = if ($(weightCol).isEmpty) lit(1.0) else col($(weightCol)) + val instances: RDD[Instance] = dataset.select(col($(labelCol)), w, col($(featuresCol))).map { + case Row(label: Double, weight: Double, features: Vector) => + Instance(label, weight, features) } + val handlePersistence = dataset.rdd.getStorageLevel == StorageLevel.NONE if (handlePersistence) instances.persist(StorageLevel.MEMORY_AND_DISK) - val (summarizer, statCounter) = instances.treeAggregate( - (new MultivariateOnlineSummarizer, new StatCounter))( - seqOp = (c, v) => (c, v) match { - case ((summarizer: MultivariateOnlineSummarizer, statCounter: StatCounter), - (label: Double, features: Vector)) => - (summarizer.add(features), statCounter.merge(label)) - }, - combOp = (c1, c2) => (c1, c2) match { - case ((summarizer1: MultivariateOnlineSummarizer, statCounter1: StatCounter), - (summarizer2: MultivariateOnlineSummarizer, statCounter2: StatCounter)) => - (summarizer1.merge(summarizer2), statCounter1.merge(statCounter2)) - }) - - val numFeatures = summarizer.mean.size - val yMean = statCounter.mean - val yStd = math.sqrt(statCounter.variance) + val (featuresSummarizer, ySummarizer) = { + val seqOp = (c: (MultivariateOnlineSummarizer, MultivariateOnlineSummarizer), + instance: Instance) => + (c._1.add(instance.features, instance.weight), + c._2.add(Vectors.dense(instance.label), instance.weight)) + + val combOp = (c1: (MultivariateOnlineSummarizer, MultivariateOnlineSummarizer), + c2: (MultivariateOnlineSummarizer, MultivariateOnlineSummarizer)) => + (c1._1.merge(c2._1), c1._2.merge(c2._2)) + + instances.treeAggregate( + new MultivariateOnlineSummarizer, new MultivariateOnlineSummarizer)(seqOp, combOp) + } + + val numFeatures = featuresSummarizer.mean.size + val yMean = ySummarizer.mean(0) + val yStd = math.sqrt(ySummarizer.variance(0)) // If the yStd is zero, then the intercept is yMean with zero weights; // as a result, training is not needed. @@ -167,8 +188,8 @@ class LinearRegression(override val uid: String) return copyValues(model.setSummary(trainingSummary)) } - val featuresMean = summarizer.mean.toArray - val featuresStd = summarizer.variance.toArray.map(math.sqrt) + val featuresMean = featuresSummarizer.mean.toArray + val featuresStd = featuresSummarizer.variance.toArray.map(math.sqrt) // Since we implicitly do the feature scaling when we compute the cost function // to improve the convergence, the effective regParam will be changed. @@ -318,7 +339,8 @@ class LinearRegressionModel private[ml] ( /** * :: Experimental :: - * Linear regression training results. + * Linear regression training results. Currently, the training summary ignores the + * training weights except for the objective trace. * @param predictions predictions outputted by the model's `transform` method. * @param objectiveHistory objective function (scaled loss + regularization) at each iteration. */ @@ -477,7 +499,7 @@ class LinearRegressionSummary private[regression] ( * \frac{\partial L}{\partial\w_i} = 1/N ((\sum_j diff_j x_{ij} / \hat{x_i}) * }}}, * - * @param weights The weights/coefficients corresponding to the features. + * @param coefficients The coefficients corresponding to the features. * @param labelStd The standard deviation value of the label. * @param labelMean The mean value of the label. * @param fitIntercept Whether to fit an intercept term. 
@@ -485,7 +507,7 @@ class LinearRegressionSummary private[regression] ( * @param featuresMean The mean values of the features. */ private class LeastSquaresAggregator( - weights: Vector, + coefficients: Vector, labelStd: Double, labelMean: Double, fitIntercept: Boolean, @@ -493,26 +515,28 @@ private class LeastSquaresAggregator( featuresMean: Array[Double]) extends Serializable { private var totalCnt: Long = 0L + private var weightSum: Double = 0.0 private var lossSum = 0.0 - private val (effectiveWeightsArray: Array[Double], offset: Double, dim: Int) = { - val weightsArray = weights.toArray.clone() + private val (effectiveCoefficientsArray: Array[Double], offset: Double, dim: Int) = { + val coefficientsArray = coefficients.toArray.clone() var sum = 0.0 var i = 0 - val len = weightsArray.length + val len = coefficientsArray.length while (i < len) { if (featuresStd(i) != 0.0) { - weightsArray(i) /= featuresStd(i) - sum += weightsArray(i) * featuresMean(i) + coefficientsArray(i) /= featuresStd(i) + sum += coefficientsArray(i) * featuresMean(i) } else { - weightsArray(i) = 0.0 + coefficientsArray(i) = 0.0 } i += 1 } - (weightsArray, if (fitIntercept) labelMean / labelStd - sum else 0.0, weightsArray.length) + val offset = if (fitIntercept) labelMean / labelStd - sum else 0.0 + (coefficientsArray, offset, coefficientsArray.length) } - private val effectiveWeightsVector = Vectors.dense(effectiveWeightsArray) + private val effectiveCoefficientsVector = Vectors.dense(effectiveCoefficientsArray) private val gradientSumArray = Array.ofDim[Double](dim) @@ -520,30 +544,33 @@ private class LeastSquaresAggregator( * Add a new training data to this LeastSquaresAggregator, and update the loss and gradient * of the objective function. * - * @param label The label for this data point. - * @param data The features for one data point in dense/sparse vector format to be added - * into this aggregator. + * @param instance The data point instance to be added. * @return This LeastSquaresAggregator object. */ - def add(label: Double, data: Vector): this.type = { - require(dim == data.size, s"Dimensions mismatch when adding new sample." + - s" Expecting $dim but got ${data.size}.") + def add(instance: Instance): this.type = + instance match { case Instance(label, weight, features) => + require(dim == features.size, s"Dimensions mismatch when adding new sample." 
+ + s" Expecting $dim but got ${features.size}.") + require(weight >= 0.0, s"instance weight, ${weight} has to be >= 0.0") - val diff = dot(data, effectiveWeightsVector) - label / labelStd + offset + if (weight == 0.0) return this - if (diff != 0) { - val localGradientSumArray = gradientSumArray - data.foreachActive { (index, value) => - if (featuresStd(index) != 0.0 && value != 0.0) { - localGradientSumArray(index) += diff * value / featuresStd(index) + val diff = dot(features, effectiveCoefficientsVector) - label / labelStd + offset + + if (diff != 0) { + val localGradientSumArray = gradientSumArray + features.foreachActive { (index, value) => + if (featuresStd(index) != 0.0 && value != 0.0) { + localGradientSumArray(index) += weight * diff * value / featuresStd(index) + } } + lossSum += weight * diff * diff / 2.0 } - lossSum += diff * diff / 2.0 - } - totalCnt += 1 - this - } + totalCnt += 1 + weightSum += weight + this + } /** * Merge another LeastSquaresAggregator, and update the loss and gradient @@ -557,8 +584,9 @@ private class LeastSquaresAggregator( require(dim == other.dim, s"Dimensions mismatch when merging with another " + s"LeastSquaresAggregator. Expecting $dim but got ${other.dim}.") - if (other.totalCnt != 0) { + if (other.weightSum != 0) { totalCnt += other.totalCnt + weightSum += other.weightSum lossSum += other.lossSum var i = 0 @@ -574,11 +602,17 @@ private class LeastSquaresAggregator( def count: Long = totalCnt - def loss: Double = lossSum / totalCnt + def loss: Double = { + require(weightSum > 0.0, s"The effective number of instances should be " + + s"greater than 0.0, but $weightSum.") + lossSum / weightSum + } def gradient: Vector = { + require(weightSum > 0.0, s"The effective number of instances should be " + + s"greater than 0.0, but $weightSum.") val result = Vectors.dense(gradientSumArray.clone()) - scal(1.0 / totalCnt, result) + scal(1.0 / weightSum, result) result } } @@ -589,7 +623,7 @@ private class LeastSquaresAggregator( * It's used in Breeze's convex optimization routines. 
*/ private class LeastSquaresCostFun( - data: RDD[(Double, Vector)], + data: RDD[Instance], labelStd: Double, labelMean: Double, fitIntercept: Boolean, @@ -598,17 +632,13 @@ private class LeastSquaresCostFun( featuresMean: Array[Double], effectiveL2regParam: Double) extends DiffFunction[BDV[Double]] { - override def calculate(weights: BDV[Double]): (Double, BDV[Double]) = { - val w = Vectors.fromBreeze(weights) + override def calculate(coefficients: BDV[Double]): (Double, BDV[Double]) = { + val coeff = Vectors.fromBreeze(coefficients) - val leastSquaresAggregator = data.treeAggregate(new LeastSquaresAggregator(w, labelStd, + val leastSquaresAggregator = data.treeAggregate(new LeastSquaresAggregator(coeff, labelStd, labelMean, fitIntercept, featuresStd, featuresMean))( - seqOp = (c, v) => (c, v) match { - case (aggregator, (label, features)) => aggregator.add(label, features) - }, - combOp = (c1, c2) => (c1, c2) match { - case (aggregator1, aggregator2) => aggregator1.merge(aggregator2) - }) + seqOp = (aggregator, instance) => aggregator.add(instance), + combOp = (aggregator1, aggregator2) => aggregator1.merge(aggregator2)) val totalGradientArray = leastSquaresAggregator.gradient.toArray @@ -616,7 +646,7 @@ private class LeastSquaresCostFun( 0.0 } else { var sum = 0.0 - w.foreachActive { (index, value) => + coeff.foreachActive { (index, value) => // The following code will compute the loss of the regularization; also // the gradient of the regularization, and add back to totalGradientArray. sum += { diff --git a/mllib/src/test/scala/org/apache/spark/ml/regression/LinearRegressionSuite.scala b/mllib/src/test/scala/org/apache/spark/ml/regression/LinearRegressionSuite.scala index 2aaee71ecc734..8428f4f00b370 100644 --- a/mllib/src/test/scala/org/apache/spark/ml/regression/LinearRegressionSuite.scala +++ b/mllib/src/test/scala/org/apache/spark/ml/regression/LinearRegressionSuite.scala @@ -17,10 +17,13 @@ package org.apache.spark.ml.regression +import scala.util.Random + import org.apache.spark.SparkFunSuite import org.apache.spark.ml.param.ParamsSuite import org.apache.spark.ml.util.MLTestingUtils import org.apache.spark.mllib.linalg.{DenseVector, Vectors} +import org.apache.spark.mllib.regression.LabeledPoint import org.apache.spark.mllib.util.{LinearDataGenerator, MLlibTestSparkContext} import org.apache.spark.mllib.util.TestingUtils._ import org.apache.spark.sql.{DataFrame, Row} @@ -510,4 +513,89 @@ class LinearRegressionSuite extends SparkFunSuite with MLlibTestSparkContext { .zip(testSummary.residuals.select("residuals").collect()) .forall { case (Row(r1: Double), Row(r2: Double)) => r1 ~== r2 relTol 1E-5 } } + + test("linear regression with weighted samples"){ + val (data, weightedData) = { + val activeData = LinearDataGenerator.generateLinearInput( + 6.3, Array(4.7, 7.2), Array(0.9, -1.3), Array(0.7, 1.2), 500, 1, 0.1) + + val rnd = new Random(8392) + val signedData = activeData.map { case p: LabeledPoint => + (rnd.nextGaussian() > 0.0, p) + } + + val data1 = signedData.flatMap { + case (true, p) => Iterator(p, p) + case (false, p) => Iterator(p) + } + + val weightedSignedData = signedData.flatMap { + case (true, LabeledPoint(label, features)) => + Iterator( + Instance(label, weight = 1.2, features), + Instance(label, weight = 0.8, features) + ) + case (false, LabeledPoint(label, features)) => + Iterator( + Instance(label, weight = 0.3, features), + Instance(label, weight = 0.1, features), + Instance(label, weight = 0.6, features) + ) + } + + val noiseData = 
LinearDataGenerator.generateLinearInput( + 2, Array(1, 3), Array(0.9, -1.3), Array(0.7, 1.2), 500, 1, 0.1) + val weightedNoiseData = noiseData.map { + case LabeledPoint(label, features) => Instance(label, weight = 0, features) + } + val data2 = weightedSignedData ++ weightedNoiseData + + (sqlContext.createDataFrame(sc.parallelize(data1, 4)), + sqlContext.createDataFrame(sc.parallelize(data2, 4))) + } + + val trainer1a = (new LinearRegression).setFitIntercept(true) + .setElasticNetParam(0.38).setRegParam(0.21).setStandardization(true) + val trainer1b = (new LinearRegression).setFitIntercept(true).setWeightCol("weight") + .setElasticNetParam(0.38).setRegParam(0.21).setStandardization(true) + val model1a0 = trainer1a.fit(data) + val model1a1 = trainer1a.fit(weightedData) + val model1b = trainer1b.fit(weightedData) + assert(model1a0.weights !~= model1a1.weights absTol 1E-3) + assert(model1a0.intercept !~= model1a1.intercept absTol 1E-3) + assert(model1a0.weights ~== model1b.weights absTol 1E-3) + assert(model1a0.intercept ~== model1b.intercept absTol 1E-3) + + val trainer2a = (new LinearRegression).setFitIntercept(true) + .setElasticNetParam(0.38).setRegParam(0.21).setStandardization(false) + val trainer2b = (new LinearRegression).setFitIntercept(true).setWeightCol("weight") + .setElasticNetParam(0.38).setRegParam(0.21).setStandardization(false) + val model2a0 = trainer2a.fit(data) + val model2a1 = trainer2a.fit(weightedData) + val model2b = trainer2b.fit(weightedData) + assert(model2a0.weights !~= model2a1.weights absTol 1E-3) + assert(model2a0.intercept !~= model2a1.intercept absTol 1E-3) + assert(model2a0.weights ~== model2b.weights absTol 1E-3) + assert(model2a0.intercept ~== model2b.intercept absTol 1E-3) + + val trainer3a = (new LinearRegression).setFitIntercept(false) + .setElasticNetParam(0.38).setRegParam(0.21).setStandardization(true) + val trainer3b = (new LinearRegression).setFitIntercept(false).setWeightCol("weight") + .setElasticNetParam(0.38).setRegParam(0.21).setStandardization(true) + val model3a0 = trainer3a.fit(data) + val model3a1 = trainer3a.fit(weightedData) + val model3b = trainer3b.fit(weightedData) + assert(model3a0.weights !~= model3a1.weights absTol 1E-3) + assert(model3a0.weights ~== model3b.weights absTol 1E-3) + + val trainer4a = (new LinearRegression).setFitIntercept(false) + .setElasticNetParam(0.38).setRegParam(0.21).setStandardization(false) + val trainer4b = (new LinearRegression).setFitIntercept(false).setWeightCol("weight") + .setElasticNetParam(0.38).setRegParam(0.21).setStandardization(false) + val model4a0 = trainer4a.fit(data) + val model4a1 = trainer4a.fit(weightedData) + val model4b = trainer4b.fit(weightedData) + assert(model4a0.weights !~= model4a1.weights absTol 1E-3) + assert(model4a0.weights ~== model4b.weights absTol 1E-3) + } } diff --git a/project/MimaExcludes.scala b/project/MimaExcludes.scala index 814a11e588ceb..b2e6be706637b 100644 --- a/project/MimaExcludes.scala +++ b/project/MimaExcludes.scala @@ -70,10 +70,14 @@ object MimaExcludes { "org.apache.spark.scheduler.AskPermissionToCommitOutput.this"), ProblemFilters.exclude[IncompatibleMethTypeProblem]( "org.apache.spark.scheduler.AskPermissionToCommitOutput.apply") - ) ++ - Seq( + ) ++ Seq( ProblemFilters.exclude[MissingClassProblem]( "org.apache.spark.shuffle.FileShuffleBlockResolver$ShuffleFileGroup") + ) ++ Seq( + ProblemFilters.exclude[MissingMethodProblem]( + "org.apache.spark.ml.regression.LeastSquaresAggregator.add"), + ProblemFilters.exclude[MissingMethodProblem]( + 
"org.apache.spark.ml.regression.LeastSquaresCostFun.this") ) case v if v.startsWith("1.5") => Seq( From b78c65b03ae87a3ba348c9d29ff4c296349eb49c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?hushan=5B=E8=83=A1=E7=8F=8A=5D?= Date: Mon, 21 Sep 2015 14:26:15 -0500 Subject: [PATCH 362/802] [SPARK-5259] [CORE] don't submit stage until its dependencies map outputs are registered MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Track pending tasks by partition ID instead of Task objects. Before this change, failure & retry could result in a case where a stage got submitted before the map output from its dependencies get registered. This was due to an error in the condition for registering map outputs. Author: hushan[胡珊] Author: Imran Rashid Closes #7699 from squito/SPARK-5259. --- .../apache/spark/scheduler/DAGScheduler.scala | 12 +- .../org/apache/spark/scheduler/Stage.scala | 2 +- .../spark/scheduler/TaskSetManager.scala | 4 +- .../spark/scheduler/DAGSchedulerSuite.scala | 197 ++++++++++++++++-- 4 files changed, 191 insertions(+), 24 deletions(-) diff --git a/core/src/main/scala/org/apache/spark/scheduler/DAGScheduler.scala b/core/src/main/scala/org/apache/spark/scheduler/DAGScheduler.scala index 3c9a66e504403..394228b2728d1 100644 --- a/core/src/main/scala/org/apache/spark/scheduler/DAGScheduler.scala +++ b/core/src/main/scala/org/apache/spark/scheduler/DAGScheduler.scala @@ -944,7 +944,7 @@ class DAGScheduler( private def submitMissingTasks(stage: Stage, jobId: Int) { logDebug("submitMissingTasks(" + stage + ")") // Get our pending tasks and remember them in our pendingTasks entry - stage.pendingTasks.clear() + stage.pendingPartitions.clear() // First figure out the indexes of partition ids to compute. val (allPartitions: Seq[Int], partitionsToCompute: Seq[Int]) = { @@ -1060,8 +1060,8 @@ class DAGScheduler( if (tasks.size > 0) { logInfo("Submitting " + tasks.size + " missing tasks from " + stage + " (" + stage.rdd + ")") - stage.pendingTasks ++= tasks - logDebug("New pending tasks: " + stage.pendingTasks) + stage.pendingPartitions ++= tasks.map(_.partitionId) + logDebug("New pending partitions: " + stage.pendingPartitions) taskScheduler.submitTasks(new TaskSet( tasks.toArray, stage.id, stage.latestInfo.attemptId, stage.firstJobId, properties)) stage.latestInfo.submissionTime = Some(clock.getTimeMillis()) @@ -1152,7 +1152,7 @@ class DAGScheduler( case Success => listenerBus.post(SparkListenerTaskEnd(stageId, stage.latestInfo.attemptId, taskType, event.reason, event.taskInfo, event.taskMetrics)) - stage.pendingTasks -= task + stage.pendingPartitions -= task.partitionId task match { case rt: ResultTask[_, _] => // Cast to ResultStage here because it's part of the ResultTask @@ -1198,7 +1198,7 @@ class DAGScheduler( shuffleStage.addOutputLoc(smt.partitionId, status) } - if (runningStages.contains(shuffleStage) && shuffleStage.pendingTasks.isEmpty) { + if (runningStages.contains(shuffleStage) && shuffleStage.pendingPartitions.isEmpty) { markStageAsFinished(shuffleStage) logInfo("looking for newly runnable stages") logInfo("running: " + runningStages) @@ -1242,7 +1242,7 @@ class DAGScheduler( case Resubmitted => logInfo("Resubmitted " + task + ", so marking it as still running") - stage.pendingTasks += task + stage.pendingPartitions += task.partitionId case FetchFailed(bmAddress, shuffleId, mapId, reduceId, failureMessage) => val failedStage = stageIdToStage(task.stageId) diff --git a/core/src/main/scala/org/apache/spark/scheduler/Stage.scala 
b/core/src/main/scala/org/apache/spark/scheduler/Stage.scala index b37eccbd0f7b8..a3829c319c48d 100644 --- a/core/src/main/scala/org/apache/spark/scheduler/Stage.scala +++ b/core/src/main/scala/org/apache/spark/scheduler/Stage.scala @@ -66,7 +66,7 @@ private[scheduler] abstract class Stage( /** Set of jobs that this stage belongs to. */ val jobIds = new HashSet[Int] - var pendingTasks = new HashSet[Task[_]] + val pendingPartitions = new HashSet[Int] /** The ID to use for the next new attempt for this stage. */ private var nextAttemptId: Int = 0 diff --git a/core/src/main/scala/org/apache/spark/scheduler/TaskSetManager.scala b/core/src/main/scala/org/apache/spark/scheduler/TaskSetManager.scala index 62af9031b9f8b..c02597c4365c9 100644 --- a/core/src/main/scala/org/apache/spark/scheduler/TaskSetManager.scala +++ b/core/src/main/scala/org/apache/spark/scheduler/TaskSetManager.scala @@ -487,8 +487,8 @@ private[spark] class TaskSetManager( // a good proxy to task serialization time. // val timeTaken = clock.getTime() - startTime val taskName = s"task ${info.id} in stage ${taskSet.id}" - logInfo("Starting %s (TID %d, %s, %s, %d bytes)".format( - taskName, taskId, host, taskLocality, serializedTask.limit)) + logInfo(s"Starting $taskName (TID $taskId, $host, partition ${task.partitionId}," + + s"$taskLocality, ${serializedTask.limit} bytes)") sched.dagScheduler.taskStarted(task, info) return Some(new TaskDescription(taskId = taskId, attemptNumber = attemptNum, execId, diff --git a/core/src/test/scala/org/apache/spark/scheduler/DAGSchedulerSuite.scala b/core/src/test/scala/org/apache/spark/scheduler/DAGSchedulerSuite.scala index 1c55f90ad9b44..6b5bcf0574de6 100644 --- a/core/src/test/scala/org/apache/spark/scheduler/DAGSchedulerSuite.scala +++ b/core/src/test/scala/org/apache/spark/scheduler/DAGSchedulerSuite.scala @@ -479,8 +479,8 @@ class DAGSchedulerSuite val reduceRdd = new MyRDD(sc, 2, List(shuffleDep)) submit(reduceRdd, Array(0, 1)) complete(taskSets(0), Seq( - (Success, makeMapStatus("hostA", reduceRdd.partitions.size)), - (Success, makeMapStatus("hostB", reduceRdd.partitions.size)))) + (Success, makeMapStatus("hostA", reduceRdd.partitions.length)), + (Success, makeMapStatus("hostB", reduceRdd.partitions.length)))) // the 2nd ResultTask failed complete(taskSets(1), Seq( (Success, 42), @@ -490,7 +490,7 @@ class DAGSchedulerSuite // ask the scheduler to try it again scheduler.resubmitFailedStages() // have the 2nd attempt pass - complete(taskSets(2), Seq((Success, makeMapStatus("hostA", reduceRdd.partitions.size)))) + complete(taskSets(2), Seq((Success, makeMapStatus("hostA", reduceRdd.partitions.length)))) // we can see both result blocks now assert(mapOutputTracker.getMapSizesByExecutorId(shuffleId, 0).map(_._1.host).toSet === HashSet("hostA", "hostB")) @@ -782,8 +782,8 @@ class DAGSchedulerSuite val reduceRdd = new MyRDD(sc, 2, List(shuffleDep)) submit(reduceRdd, Array(0, 1)) complete(taskSets(0), Seq( - (Success, makeMapStatus("hostA", reduceRdd.partitions.size)), - (Success, makeMapStatus("hostB", reduceRdd.partitions.size)))) + (Success, makeMapStatus("hostA", reduceRdd.partitions.length)), + (Success, makeMapStatus("hostB", reduceRdd.partitions.length)))) // The MapOutputTracker should know about both map output locations. assert(mapOutputTracker.getMapSizesByExecutorId(shuffleId, 0).map(_._1.host).toSet === HashSet("hostA", "hostB")) @@ -1035,6 +1035,173 @@ class DAGSchedulerSuite assertDataStructuresEmpty() } + /** + * This test runs a three stage job, with a fetch failure in stage 1. 
but during the retry, we + * have completions from both the first & second attempt of stage 1. So all the map output is + * available before we finish any task set for stage 1. We want to make sure that we don't + * submit stage 2 until the map output for stage 1 is registered + */ + test("don't submit stage until its dependencies map outputs are registered (SPARK-5259)") { + val firstRDD = new MyRDD(sc, 3, Nil) + val firstShuffleDep = new ShuffleDependency(firstRDD, null) + val firstShuffleId = firstShuffleDep.shuffleId + val shuffleMapRdd = new MyRDD(sc, 3, List(firstShuffleDep)) + val shuffleDep = new ShuffleDependency(shuffleMapRdd, null) + val reduceRdd = new MyRDD(sc, 1, List(shuffleDep)) + submit(reduceRdd, Array(0)) + + // things start out smoothly, stage 0 completes with no issues + complete(taskSets(0), Seq( + (Success, makeMapStatus("hostB", shuffleMapRdd.partitions.length)), + (Success, makeMapStatus("hostB", shuffleMapRdd.partitions.length)), + (Success, makeMapStatus("hostA", shuffleMapRdd.partitions.length)) + )) + + // then one executor dies, and a task fails in stage 1 + runEvent(ExecutorLost("exec-hostA")) + runEvent(CompletionEvent( + taskSets(1).tasks(0), + FetchFailed(null, firstShuffleId, 2, 0, "Fetch failed"), + null, + null, + createFakeTaskInfo(), + null)) + + // so we resubmit stage 0, which completes happily + scheduler.resubmitFailedStages() + val stage0Resubmit = taskSets(2) + assert(stage0Resubmit.stageId == 0) + assert(stage0Resubmit.stageAttemptId === 1) + val task = stage0Resubmit.tasks(0) + assert(task.partitionId === 2) + runEvent(CompletionEvent( + task, + Success, + makeMapStatus("hostC", shuffleMapRdd.partitions.length), + null, + createFakeTaskInfo(), + null)) + + // now here is where things get tricky : we will now have a task set representing + // the second attempt for stage 1, but we *also* have some tasks for the first attempt for + // stage 1 still going + val stage1Resubmit = taskSets(3) + assert(stage1Resubmit.stageId == 1) + assert(stage1Resubmit.stageAttemptId === 1) + assert(stage1Resubmit.tasks.length === 3) + + // we'll have some tasks finish from the first attempt, and some finish from the second attempt, + // so that we actually have all stage outputs, though no attempt has completed all its + // tasks + runEvent(CompletionEvent( + taskSets(3).tasks(0), + Success, + makeMapStatus("hostC", reduceRdd.partitions.length), + null, + createFakeTaskInfo(), + null)) + runEvent(CompletionEvent( + taskSets(3).tasks(1), + Success, + makeMapStatus("hostC", reduceRdd.partitions.length), + null, + createFakeTaskInfo(), + null)) + // late task finish from the first attempt + runEvent(CompletionEvent( + taskSets(1).tasks(2), + Success, + makeMapStatus("hostB", reduceRdd.partitions.length), + null, + createFakeTaskInfo(), + null)) + + // What should happen now is that we submit stage 2. However, we might not see an error + // b/c of DAGScheduler's error handling (it tends to swallow errors and just log them). But + // we can check some conditions. + // Note that the really important thing here is not so much that we submit stage 2 *immediately* + // but that we don't end up with some error from these interleaved completions. 
It would also + // be OK (though sub-optimal) if stage 2 simply waited until the resubmission of stage 1 had + // all its tasks complete + + // check that we have all the map output for stage 0 (it should have been there even before + // the last round of completions from stage 1, but just to double check it hasn't been messed + // up) and also the newly available stage 1 + val stageToReduceIdxs = Seq( + 0 -> (0 until 3), + 1 -> (0 until 1) + ) + for { + (stage, reduceIdxs) <- stageToReduceIdxs + reduceIdx <- reduceIdxs + } { + // this would throw an exception if the map status hadn't been registered + val statuses = mapOutputTracker.getMapSizesByExecutorId(stage, reduceIdx) + // really we should have already thrown an exception rather than fail either of these + // asserts, but just to be extra defensive let's double check the statuses are OK + assert(statuses != null) + assert(statuses.nonEmpty) + } + + // and check that stage 2 has been submitted + assert(taskSets.size == 5) + val stage2TaskSet = taskSets(4) + assert(stage2TaskSet.stageId == 2) + assert(stage2TaskSet.stageAttemptId == 0) + } + + /** + * We lose an executor after completing some shuffle map tasks on it. Those tasks get + * resubmitted, and when they finish the job completes normally + */ + test("register map outputs correctly after ExecutorLost and task Resubmitted") { + val firstRDD = new MyRDD(sc, 3, Nil) + val firstShuffleDep = new ShuffleDependency(firstRDD, null) + val reduceRdd = new MyRDD(sc, 5, List(firstShuffleDep)) + submit(reduceRdd, Array(0)) + + // complete some of the tasks from the first stage, on one host + runEvent(CompletionEvent( + taskSets(0).tasks(0), Success, + makeMapStatus("hostA", reduceRdd.partitions.length), null, createFakeTaskInfo(), null)) + runEvent(CompletionEvent( + taskSets(0).tasks(1), Success, + makeMapStatus("hostA", reduceRdd.partitions.length), null, createFakeTaskInfo(), null)) + + // now that host goes down + runEvent(ExecutorLost("exec-hostA")) + + // so we resubmit those tasks + runEvent(CompletionEvent( + taskSets(0).tasks(0), Resubmitted, null, null, createFakeTaskInfo(), null)) + runEvent(CompletionEvent( + taskSets(0).tasks(1), Resubmitted, null, null, createFakeTaskInfo(), null)) + + // now complete everything on a different host + complete(taskSets(0), Seq( + (Success, makeMapStatus("hostB", reduceRdd.partitions.length)), + (Success, makeMapStatus("hostB", reduceRdd.partitions.length)), + (Success, makeMapStatus("hostB", reduceRdd.partitions.length)) + )) + + // now we should submit stage 1, and the map output from stage 0 should be registered + + // check that we have all the map output for stage 0 + (0 until reduceRdd.partitions.length).foreach { reduceIdx => + val statuses = mapOutputTracker.getMapSizesByExecutorId(0, reduceIdx) + // really we should have already thrown an exception rather than fail either of these + // asserts, but just to be extra defensive let's double check the statuses are OK + assert(statuses != null) + assert(statuses.nonEmpty) + } + + // and check that stage 1 has been submitted + assert(taskSets.size == 2) + val stage1TaskSet = taskSets(1) + assert(stage1TaskSet.stageId == 1) + assert(stage1TaskSet.stageAttemptId == 0) + } + /** * Makes sure that failures of stage used by multiple jobs are correctly handled. 
* @@ -1393,8 +1560,8 @@ class DAGSchedulerSuite // Submit a map stage by itself submitMapStage(shuffleDep) complete(taskSets(0), Seq( - (Success, makeMapStatus("hostA", reduceRdd.partitions.size)), - (Success, makeMapStatus("hostB", reduceRdd.partitions.size)))) + (Success, makeMapStatus("hostA", reduceRdd.partitions.length)), + (Success, makeMapStatus("hostB", reduceRdd.partitions.length)))) assert(results.size === 1) results.clear() assertDataStructuresEmpty() @@ -1407,7 +1574,7 @@ class DAGSchedulerSuite // Ask the scheduler to try it again; TaskSet 2 will rerun the map task that we couldn't fetch // from, then TaskSet 3 will run the reduce stage scheduler.resubmitFailedStages() - complete(taskSets(2), Seq((Success, makeMapStatus("hostA", reduceRdd.partitions.size)))) + complete(taskSets(2), Seq((Success, makeMapStatus("hostA", reduceRdd.partitions.length)))) complete(taskSets(3), Seq((Success, 43))) assert(results === Map(0 -> 42, 1 -> 43)) results.clear() @@ -1452,8 +1619,8 @@ class DAGSchedulerSuite // Complete the first stage assert(taskSets(0).stageId === 0) complete(taskSets(0), Seq( - (Success, makeMapStatus("hostA", rdd1.partitions.size)), - (Success, makeMapStatus("hostB", rdd1.partitions.size)))) + (Success, makeMapStatus("hostA", rdd1.partitions.length)), + (Success, makeMapStatus("hostB", rdd1.partitions.length)))) assert(mapOutputTracker.getMapSizesByExecutorId(dep1.shuffleId, 0).map(_._1).toSet === HashSet(makeBlockManagerId("hostA"), makeBlockManagerId("hostB"))) assert(listener1.results.size === 1) @@ -1461,7 +1628,7 @@ class DAGSchedulerSuite // When attempting the second stage, show a fetch failure assert(taskSets(1).stageId === 1) complete(taskSets(1), Seq( - (Success, makeMapStatus("hostA", rdd2.partitions.size)), + (Success, makeMapStatus("hostA", rdd2.partitions.length)), (FetchFailed(makeBlockManagerId("hostA"), dep1.shuffleId, 0, 0, "ignored"), null))) scheduler.resubmitFailedStages() assert(listener2.results.size === 0) // Second stage listener should not have a result yet @@ -1469,7 +1636,7 @@ class DAGSchedulerSuite // Stage 0 should now be running as task set 2; make its task succeed assert(taskSets(2).stageId === 0) complete(taskSets(2), Seq( - (Success, makeMapStatus("hostC", rdd2.partitions.size)))) + (Success, makeMapStatus("hostC", rdd2.partitions.length)))) assert(mapOutputTracker.getMapSizesByExecutorId(dep1.shuffleId, 0).map(_._1).toSet === HashSet(makeBlockManagerId("hostC"), makeBlockManagerId("hostB"))) assert(listener2.results.size === 0) // Second stage listener should still not have a result @@ -1477,8 +1644,8 @@ class DAGSchedulerSuite // Stage 1 should now be running as task set 3; make its first task succeed assert(taskSets(3).stageId === 1) complete(taskSets(3), Seq( - (Success, makeMapStatus("hostB", rdd2.partitions.size)), - (Success, makeMapStatus("hostD", rdd2.partitions.size)))) + (Success, makeMapStatus("hostB", rdd2.partitions.length)), + (Success, makeMapStatus("hostD", rdd2.partitions.length)))) assert(mapOutputTracker.getMapSizesByExecutorId(dep2.shuffleId, 0).map(_._1).toSet === HashSet(makeBlockManagerId("hostB"), makeBlockManagerId("hostD"))) assert(listener2.results.size === 1) @@ -1494,7 +1661,7 @@ class DAGSchedulerSuite // TaskSet 5 will rerun stage 1's lost task, then TaskSet 6 will rerun stage 2 assert(taskSets(5).stageId === 1) complete(taskSets(5), Seq( - (Success, makeMapStatus("hostE", rdd2.partitions.size)))) + (Success, makeMapStatus("hostE", rdd2.partitions.length)))) complete(taskSets(6), Seq( (Success, 53))) 
assert(listener3.results === Map(0 -> 52, 1 -> 53)) From ba882db6f43dd2bc05675133158e4664ed07030a Mon Sep 17 00:00:00 2001 From: Holden Karau Date: Mon, 21 Sep 2015 13:06:23 -0700 Subject: [PATCH 363/802] [SPARK-9769] [ML] [PY] add python api for countvectorizermodel From JIRA: Add Python API, user guide and example for ml.feature.CountVectorizerModel Author: Holden Karau Closes #8561 from holdenk/SPARK-9769-add-python-api-for-countvectorizermodel. --- python/pyspark/ml/feature.py | 148 +++++++++++++++++++++++++++++++++-- 1 file changed, 142 insertions(+), 6 deletions(-) diff --git a/python/pyspark/ml/feature.py b/python/pyspark/ml/feature.py index 92db8df80280b..f41d72f877256 100644 --- a/python/pyspark/ml/feature.py +++ b/python/pyspark/ml/feature.py @@ -26,12 +26,13 @@ from pyspark.mllib.common import inherit_doc from pyspark.mllib.linalg import _convert_to_vector -__all__ = ['Binarizer', 'Bucketizer', 'DCT', 'ElementwiseProduct', 'HashingTF', 'IDF', 'IDFModel', - 'IndexToString', 'MinMaxScaler', 'MinMaxScalerModel', 'NGram', 'Normalizer', - 'OneHotEncoder', 'PCA', 'PCAModel', 'PolynomialExpansion', 'RegexTokenizer', - 'RFormula', 'RFormulaModel', 'SQLTransformer', 'StandardScaler', 'StandardScalerModel', - 'StopWordsRemover', 'StringIndexer', 'StringIndexerModel', 'Tokenizer', - 'VectorAssembler', 'VectorIndexer', 'VectorSlicer', 'Word2Vec', 'Word2VecModel'] +__all__ = ['Binarizer', 'Bucketizer', 'CountVectorizer', 'CountVectorizerModel', 'DCT', + 'ElementwiseProduct', 'HashingTF', 'IDF', 'IDFModel', 'IndexToString', 'MinMaxScaler', + 'MinMaxScalerModel', 'NGram', 'Normalizer', 'OneHotEncoder', 'PCA', 'PCAModel', + 'PolynomialExpansion', 'RegexTokenizer', 'RFormula', 'RFormulaModel', 'SQLTransformer', + 'StandardScaler', 'StandardScalerModel', 'StopWordsRemover', 'StringIndexer', + 'StringIndexerModel', 'Tokenizer', 'VectorAssembler', 'VectorIndexer', 'VectorSlicer', + 'Word2Vec', 'Word2VecModel'] @inherit_doc @@ -171,6 +172,141 @@ def getSplits(self): return self.getOrDefault(self.splits) +@inherit_doc +class CountVectorizer(JavaEstimator, HasInputCol, HasOutputCol): + """ + .. note:: Experimental + + Extracts a vocabulary from document collections and generates a :py:attr:`CountVectorizerModel`. + >>> df = sqlContext.createDataFrame( + ... [(0, ["a", "b", "c"]), (1, ["a", "b", "b", "c", "a"])], + ... ["label", "raw"]) + >>> cv = CountVectorizer(inputCol="raw", outputCol="vectors") + >>> model = cv.fit(df) + >>> model.transform(df).show(truncate=False) + +-----+---------------+-------------------------+ + |label|raw |vectors | + +-----+---------------+-------------------------+ + |0 |[a, b, c] |(3,[0,1,2],[1.0,1.0,1.0])| + |1 |[a, b, b, c, a]|(3,[0,1,2],[2.0,2.0,1.0])| + +-----+---------------+-------------------------+ + ... + >>> sorted(map(str, model.vocabulary)) + ['a', 'b', 'c'] + """ + + # a placeholder to make it appear in the generated doc + minTF = Param( + Params._dummy(), "minTF", "Filter to ignore rare words in" + + " a document. For each document, terms with frequency/count less than the given" + + " threshold are ignored. If this is an integer >= 1, then this specifies a count (of" + + " times the term must appear in the document); if this is a double in [0,1), then this " + + "specifies a fraction (out of the document's token count). Note that the parameter is " + + "only used in transform of CountVectorizerModel and does not affect fitting. 
Default 1.0") + minDF = Param( + Params._dummy(), "minDF", "Specifies the minimum number of" + + " different documents a term must appear in to be included in the vocabulary." + + " If this is an integer >= 1, this specifies the number of documents the term must" + + " appear in; if this is a double in [0,1), then this specifies the fraction of documents." + + " Default 1.0") + vocabSize = Param( + Params._dummy(), "vocabSize", "max size of the vocabulary. Default 1 << 18.") + + @keyword_only + def __init__(self, minTF=1.0, minDF=1.0, vocabSize=1 << 18, inputCol=None, outputCol=None): + """ + __init__(self, minTF=1.0, minDF=1.0, vocabSize=1 << 18, inputCol=None, outputCol=None) + """ + super(CountVectorizer, self).__init__() + self._java_obj = self._new_java_obj("org.apache.spark.ml.feature.CountVectorizer", + self.uid) + self.minTF = Param( + self, "minTF", "Filter to ignore rare words in" + + " a document. For each document, terms with frequency/count less than the given" + + " threshold are ignored. If this is an integer >= 1, then this specifies a count (of" + + " times the term must appear in the document); if this is a double in [0,1), then " + + "this specifies a fraction (out of the document's token count). Note that the " + + "parameter is only used in transform of CountVectorizerModel and does not affect" + + "fitting. Default 1.0") + self.minDF = Param( + self, "minDF", "Specifies the minimum number of" + + " different documents a term must appear in to be included in the vocabulary." + + " If this is an integer >= 1, this specifies the number of documents the term must" + + " appear in; if this is a double in [0,1), then this specifies the fraction of " + + "documents. Default 1.0") + self.vocabSize = Param( + self, "vocabSize", "max size of the vocabulary. Default 1 << 18.") + self._setDefault(minTF=1.0, minDF=1.0, vocabSize=1 << 18) + kwargs = self.__init__._input_kwargs + self.setParams(**kwargs) + + @keyword_only + def setParams(self, minTF=1.0, minDF=1.0, vocabSize=1 << 18, inputCol=None, outputCol=None): + """ + setParams(self, minTF=1.0, minDF=1.0, vocabSize=1 << 18, inputCol=None, outputCol=None) + Set the params for the CountVectorizer + """ + kwargs = self.setParams._input_kwargs + return self._set(**kwargs) + + def setMinTF(self, value): + """ + Sets the value of :py:attr:`minTF`. + """ + self._paramMap[self.minTF] = value + return self + + def getMinTF(self): + """ + Gets the value of minTF or its default value. + """ + return self.getOrDefault(self.minTF) + + def setMinDF(self, value): + """ + Sets the value of :py:attr:`minDF`. + """ + self._paramMap[self.minDF] = value + return self + + def getMinDF(self): + """ + Gets the value of minDF or its default value. + """ + return self.getOrDefault(self.minDF) + + def setVocabSize(self, value): + """ + Sets the value of :py:attr:`vocabSize`. + """ + self._paramMap[self.vocabSize] = value + return self + + def getVocabSize(self): + """ + Gets the value of vocabSize or its default value. + """ + return self.getOrDefault(self.vocabSize) + + def _create_model(self, java_model): + return CountVectorizerModel(java_model) + + +class CountVectorizerModel(JavaModel): + """ + .. note:: Experimental + + Model fitted by CountVectorizer. + """ + + @property + def vocabulary(self): + """ + An array of terms in the vocabulary. 
+ """ + return self._call_java("vocabulary") + + @inherit_doc class DCT(JavaTransformer, HasInputCol, HasOutputCol): """ From aeef44a3e32b53f7adecc8e9cfd684fb4598e87d Mon Sep 17 00:00:00 2001 From: Feynman Liang Date: Mon, 21 Sep 2015 13:11:28 -0700 Subject: [PATCH 364/802] [SPARK-3147] [MLLIB] [STREAMING] Streaming 2-sample statistical significance testing Implementation of significance testing using Streaming API. Author: Feynman Liang Author: Feynman Liang Closes #4716 from feynmanliang/ab_testing. --- .../examples/mllib/StreamingTestExample.scala | 90 +++++++ .../spark/mllib/stat/test/StreamingTest.scala | 145 +++++++++++ .../mllib/stat/test/StreamingTestMethod.scala | 167 ++++++++++++ .../spark/mllib/stat/test/TestResult.scala | 22 ++ .../spark/mllib/stat/StreamingTestSuite.scala | 243 ++++++++++++++++++ 5 files changed, 667 insertions(+) create mode 100644 examples/src/main/scala/org/apache/spark/examples/mllib/StreamingTestExample.scala create mode 100644 mllib/src/main/scala/org/apache/spark/mllib/stat/test/StreamingTest.scala create mode 100644 mllib/src/main/scala/org/apache/spark/mllib/stat/test/StreamingTestMethod.scala create mode 100644 mllib/src/test/scala/org/apache/spark/mllib/stat/StreamingTestSuite.scala diff --git a/examples/src/main/scala/org/apache/spark/examples/mllib/StreamingTestExample.scala b/examples/src/main/scala/org/apache/spark/examples/mllib/StreamingTestExample.scala new file mode 100644 index 0000000000000..ab29f90254d34 --- /dev/null +++ b/examples/src/main/scala/org/apache/spark/examples/mllib/StreamingTestExample.scala @@ -0,0 +1,90 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.spark.examples.mllib + +import org.apache.spark.SparkConf +import org.apache.spark.mllib.stat.test.StreamingTest +import org.apache.spark.streaming.{Seconds, StreamingContext} +import org.apache.spark.util.Utils + +/** + * Perform streaming testing using Welch's 2-sample t-test on a stream of data, where the data + * stream arrives as text files in a directory. Stops when the two groups are statistically + * significant (p-value < 0.05) or after a user-specified timeout in number of batches is exceeded. + * + * The rows of the text files must be in the form `Boolean, Double`. 
For example: + * false, -3.92 + * true, 99.32 + * + * Usage: + * StreamingTestExample + * + * To run on your local machine using the directory `dataDir` with 5 seconds between each batch and + * a timeout after 100 insignificant batches, call: + * $ bin/run-example mllib.StreamingTestExample dataDir 5 100 + * + * As you add text files to `dataDir` the significance test wil continually update every + * `batchDuration` seconds until the test becomes significant (p-value < 0.05) or the number of + * batches processed exceeds `numBatchesTimeout`. + */ +object StreamingTestExample { + + def main(args: Array[String]) { + if (args.length != 3) { + // scalastyle:off println + System.err.println( + "Usage: StreamingTestExample " + + " ") + // scalastyle:on println + System.exit(1) + } + val dataDir = args(0) + val batchDuration = Seconds(args(1).toLong) + val numBatchesTimeout = args(2).toInt + + val conf = new SparkConf().setMaster("local").setAppName("StreamingTestExample") + val ssc = new StreamingContext(conf, batchDuration) + ssc.checkpoint({ + val dir = Utils.createTempDir() + dir.toString + }) + + val data = ssc.textFileStream(dataDir).map(line => line.split(",") match { + case Array(label, value) => (label.toBoolean, value.toDouble) + }) + + val streamingTest = new StreamingTest() + .setPeacePeriod(0) + .setWindowSize(0) + .setTestMethod("welch") + + val out = streamingTest.registerStream(data) + out.print() + + // Stop processing if test becomes significant or we time out + var timeoutCounter = numBatchesTimeout + out.foreachRDD { rdd => + timeoutCounter -= 1 + val anySignificant = rdd.map(_.pValue < 0.05).fold(false)(_ || _) + if (timeoutCounter == 0 || anySignificant) rdd.context.stop() + } + + ssc.start() + ssc.awaitTermination() + } +} diff --git a/mllib/src/main/scala/org/apache/spark/mllib/stat/test/StreamingTest.scala b/mllib/src/main/scala/org/apache/spark/mllib/stat/test/StreamingTest.scala new file mode 100644 index 0000000000000..75c6a51d09571 --- /dev/null +++ b/mllib/src/main/scala/org/apache/spark/mllib/stat/test/StreamingTest.scala @@ -0,0 +1,145 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.spark.mllib.stat.test + +import org.apache.spark.Logging +import org.apache.spark.annotation.{Experimental, Since} +import org.apache.spark.rdd.RDD +import org.apache.spark.streaming.dstream.DStream +import org.apache.spark.util.StatCounter + +/** + * :: Experimental :: + * Performs online 2-sample significance testing for a stream of (Boolean, Double) pairs. The + * Boolean identifies which sample each observation comes from, and the Double is the numeric value + * of the observation. 
+ * + * To address novelty affects, the `peacePeriod` specifies a set number of initial + * [[org.apache.spark.rdd.RDD]] batches of the [[DStream]] to be dropped from significance testing. + * + * The `windowSize` sets the number of batches each significance test is to be performed over. The + * window is sliding with a stride length of 1 batch. Setting windowSize to 0 will perform + * cumulative processing, using all batches seen so far. + * + * Different tests may be used for assessing statistical significance depending on assumptions + * satisfied by data. For more details, see [[StreamingTestMethod]]. The `testMethod` specifies + * which test will be used. + * + * Use a builder pattern to construct a streaming test in an application, for example: + * {{{ + * val model = new StreamingTest() + * .setPeacePeriod(10) + * .setWindowSize(0) + * .setTestMethod("welch") + * .registerStream(DStream) + * }}} + */ +@Experimental +@Since("1.6.0") +class StreamingTest @Since("1.6.0") () extends Logging with Serializable { + private var peacePeriod: Int = 0 + private var windowSize: Int = 0 + private var testMethod: StreamingTestMethod = WelchTTest + + /** Set the number of initial batches to ignore. Default: 0. */ + @Since("1.6.0") + def setPeacePeriod(peacePeriod: Int): this.type = { + this.peacePeriod = peacePeriod + this + } + + /** + * Set the number of batches to compute significance tests over. Default: 0. + * A value of 0 will use all batches seen so far. + */ + @Since("1.6.0") + def setWindowSize(windowSize: Int): this.type = { + this.windowSize = windowSize + this + } + + /** Set the statistical method used for significance testing. Default: "welch" */ + @Since("1.6.0") + def setTestMethod(method: String): this.type = { + this.testMethod = StreamingTestMethod.getTestMethodFromName(method) + this + } + + /** + * Register a [[DStream]] of values for significance testing. + * + * @param data stream of (key,value) pairs where the key denotes group membership (true = + * experiment, false = control) and the value is the numerical metric to test for + * significance + * @return stream of significance testing results + */ + @Since("1.6.0") + def registerStream(data: DStream[(Boolean, Double)]): DStream[StreamingTestResult] = { + val dataAfterPeacePeriod = dropPeacePeriod(data) + val summarizedData = summarizeByKeyAndWindow(dataAfterPeacePeriod) + val pairedSummaries = pairSummaries(summarizedData) + + testMethod.doTest(pairedSummaries) + } + + /** Drop all batches inside the peace period. */ + private[stat] def dropPeacePeriod( + data: DStream[(Boolean, Double)]): DStream[(Boolean, Double)] = { + data.transform { (rdd, time) => + if (time.milliseconds > data.slideDuration.milliseconds * peacePeriod) { + rdd + } else { + data.context.sparkContext.parallelize(Seq()) + } + } + } + + /** Compute summary statistics over each key and the specified test window size. 
*/ + private[stat] def summarizeByKeyAndWindow( + data: DStream[(Boolean, Double)]): DStream[(Boolean, StatCounter)] = { + if (this.windowSize == 0) { + data.updateStateByKey[StatCounter]( + (newValues: Seq[Double], oldSummary: Option[StatCounter]) => { + val newSummary = oldSummary.getOrElse(new StatCounter()) + newSummary.merge(newValues) + Some(newSummary) + }) + } else { + val windowDuration = data.slideDuration * this.windowSize + data + .groupByKeyAndWindow(windowDuration) + .mapValues { values => + val summary = new StatCounter() + values.foreach(value => summary.merge(value)) + summary + } + } + } + + /** + * Transform a stream of summaries into pairs representing summary statistics for control group + * and experiment group up to this batch. + */ + private[stat] def pairSummaries(summarizedData: DStream[(Boolean, StatCounter)]) + : DStream[(StatCounter, StatCounter)] = { + summarizedData + .map[(Int, StatCounter)](x => (0, x._2)) + .groupByKey() // should be length two (control/experiment group) + .map(x => (x._2.head, x._2.last)) + } +} diff --git a/mllib/src/main/scala/org/apache/spark/mllib/stat/test/StreamingTestMethod.scala b/mllib/src/main/scala/org/apache/spark/mllib/stat/test/StreamingTestMethod.scala new file mode 100644 index 0000000000000..a7eaed51b4d55 --- /dev/null +++ b/mllib/src/main/scala/org/apache/spark/mllib/stat/test/StreamingTestMethod.scala @@ -0,0 +1,167 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.spark.mllib.stat.test + +import java.io.Serializable + +import scala.language.implicitConversions +import scala.math.pow + +import com.twitter.chill.MeatLocker +import org.apache.commons.math3.stat.descriptive.StatisticalSummaryValues +import org.apache.commons.math3.stat.inference.TTest + +import org.apache.spark.Logging +import org.apache.spark.streaming.dstream.DStream +import org.apache.spark.util.StatCounter + +/** + * Significance testing methods for [[StreamingTest]]. New 2-sample statistical significance tests + * should extend [[StreamingTestMethod]] and introduce a new entry in + * [[StreamingTestMethod.TEST_NAME_TO_OBJECT]] + */ +private[stat] sealed trait StreamingTestMethod extends Serializable { + + val methodName: String + val nullHypothesis: String + + protected type SummaryPairStream = + DStream[(StatCounter, StatCounter)] + + /** + * Perform streaming 2-sample statistical significance testing. + * + * @param sampleSummaries stream pairs of summary statistics for the 2 samples + * @return stream of rest results + */ + def doTest(sampleSummaries: SummaryPairStream): DStream[StreamingTestResult] + + /** + * Implicit adapter to convert between streaming summary statistics type and the type required by + * the t-testing libraries. 
+ */ + protected implicit def toApacheCommonsStats( + summaryStats: StatCounter): StatisticalSummaryValues = { + new StatisticalSummaryValues( + summaryStats.mean, + summaryStats.variance, + summaryStats.count, + summaryStats.max, + summaryStats.min, + summaryStats.mean * summaryStats.count + ) + } +} + +/** + * Performs Welch's 2-sample t-test. The null hypothesis is that the two data sets have equal mean. + * This test does not assume equal variance between the two samples and does not assume equal + * sample size. + * + * @see http://en.wikipedia.org/wiki/Welch%27s_t_test + */ +private[stat] object WelchTTest extends StreamingTestMethod with Logging { + + override final val methodName = "Welch's 2-sample t-test" + override final val nullHypothesis = "Both groups have same mean" + + private final val tTester = MeatLocker(new TTest()) + + override def doTest(data: SummaryPairStream): DStream[StreamingTestResult] = + data.map[StreamingTestResult]((test _).tupled) + + private def test( + statsA: StatCounter, + statsB: StatCounter): StreamingTestResult = { + def welchDF(sample1: StatisticalSummaryValues, sample2: StatisticalSummaryValues): Double = { + val s1 = sample1.getVariance + val n1 = sample1.getN + val s2 = sample2.getVariance + val n2 = sample2.getN + + val a = pow(s1, 2) / n1 + val b = pow(s2, 2) / n2 + + pow(a + b, 2) / ((pow(a, 2) / (n1 - 1)) + (pow(b, 2) / (n2 - 1))) + } + + new StreamingTestResult( + tTester.get.tTest(statsA, statsB), + welchDF(statsA, statsB), + tTester.get.t(statsA, statsB), + methodName, + nullHypothesis + ) + } +} + +/** + * Performs Students's 2-sample t-test. The null hypothesis is that the two data sets have equal + * mean. This test assumes equal variance between the two samples and does not assume equal sample + * size. For unequal variances, Welch's t-test should be used instead. + * + * @see http://en.wikipedia.org/wiki/Student%27s_t-test + */ +private[stat] object StudentTTest extends StreamingTestMethod with Logging { + + override final val methodName = "Student's 2-sample t-test" + override final val nullHypothesis = "Both groups have same mean" + + private final val tTester = MeatLocker(new TTest()) + + override def doTest(data: SummaryPairStream): DStream[StreamingTestResult] = + data.map[StreamingTestResult]((test _).tupled) + + private def test( + statsA: StatCounter, + statsB: StatCounter): StreamingTestResult = { + def studentDF(sample1: StatisticalSummaryValues, sample2: StatisticalSummaryValues): Double = + sample1.getN + sample2.getN - 2 + + new StreamingTestResult( + tTester.get.homoscedasticTTest(statsA, statsB), + studentDF(statsA, statsB), + tTester.get.homoscedasticT(statsA, statsB), + methodName, + nullHypothesis + ) + } +} + +/** + * Companion object holding supported [[StreamingTestMethod]] names and handles conversion between + * strings used in [[StreamingTest]] configuration and actual method implementation. + * + * Currently supported tests: `welch`, `student`. + */ +private[stat] object StreamingTestMethod { + // Note: after new `StreamingTestMethod`s are implemented, please update this map. + private final val TEST_NAME_TO_OBJECT: Map[String, StreamingTestMethod] = Map( + "welch"->WelchTTest, + "student"->StudentTTest) + + def getTestMethodFromName(method: String): StreamingTestMethod = + TEST_NAME_TO_OBJECT.get(method) match { + case Some(test) => test + case None => + throw new IllegalArgumentException( + "Unrecognized method name. 
Supported streaming test methods: " + + TEST_NAME_TO_OBJECT.keys.mkString(", ")) + } +} + diff --git a/mllib/src/main/scala/org/apache/spark/mllib/stat/test/TestResult.scala b/mllib/src/main/scala/org/apache/spark/mllib/stat/test/TestResult.scala index d01b3707be944..b0916d3e84651 100644 --- a/mllib/src/main/scala/org/apache/spark/mllib/stat/test/TestResult.scala +++ b/mllib/src/main/scala/org/apache/spark/mllib/stat/test/TestResult.scala @@ -115,3 +115,25 @@ class KolmogorovSmirnovTestResult private[stat] ( "Kolmogorov-Smirnov test summary:\n" + super.toString } } + +/** + * :: Experimental :: + * Object containing the test results for streaming testing. + */ +@Experimental +@Since("1.6.0") +private[stat] class StreamingTestResult @Since("1.6.0") ( + @Since("1.6.0") override val pValue: Double, + @Since("1.6.0") override val degreesOfFreedom: Double, + @Since("1.6.0") override val statistic: Double, + @Since("1.6.0") val method: String, + @Since("1.6.0") override val nullHypothesis: String) + extends TestResult[Double] with Serializable { + + override def toString: String = { + "Streaming test summary:\n" + + s"method: $method\n" + + super.toString + } +} + diff --git a/mllib/src/test/scala/org/apache/spark/mllib/stat/StreamingTestSuite.scala b/mllib/src/test/scala/org/apache/spark/mllib/stat/StreamingTestSuite.scala new file mode 100644 index 0000000000000..d3e9ef4ff079c --- /dev/null +++ b/mllib/src/test/scala/org/apache/spark/mllib/stat/StreamingTestSuite.scala @@ -0,0 +1,243 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.spark.mllib.stat + +import org.apache.spark.SparkFunSuite +import org.apache.spark.mllib.stat.test.{StreamingTest, StreamingTestResult, StudentTTest, WelchTTest} +import org.apache.spark.streaming.TestSuiteBase +import org.apache.spark.streaming.dstream.DStream +import org.apache.spark.util.StatCounter +import org.apache.spark.util.random.XORShiftRandom + +class StreamingTestSuite extends SparkFunSuite with TestSuiteBase { + + override def maxWaitTimeMillis : Int = 30000 + + test("accuracy for null hypothesis using welch t-test") { + // set parameters + val testMethod = "welch" + val numBatches = 2 + val pointsPerBatch = 1000 + val meanA = 0 + val stdevA = 0.001 + val meanB = 0 + val stdevB = 0.001 + + val model = new StreamingTest() + .setWindowSize(0) + .setPeacePeriod(0) + .setTestMethod(testMethod) + + val input = generateTestData( + numBatches, pointsPerBatch, meanA, stdevA, meanB, stdevB, 42) + + // setup and run the model + val ssc = setupStreams( + input, (inputDStream: DStream[(Boolean, Double)]) => model.registerStream(inputDStream)) + val outputBatches = runStreams[StreamingTestResult](ssc, numBatches, numBatches) + + assert(outputBatches.flatten.forall(res => + res.pValue > 0.05 && res.method == WelchTTest.methodName)) + } + + test("accuracy for alternative hypothesis using welch t-test") { + // set parameters + val testMethod = "welch" + val numBatches = 2 + val pointsPerBatch = 1000 + val meanA = -10 + val stdevA = 1 + val meanB = 10 + val stdevB = 1 + + val model = new StreamingTest() + .setWindowSize(0) + .setPeacePeriod(0) + .setTestMethod(testMethod) + + val input = generateTestData( + numBatches, pointsPerBatch, meanA, stdevA, meanB, stdevB, 42) + + // setup and run the model + val ssc = setupStreams( + input, (inputDStream: DStream[(Boolean, Double)]) => model.registerStream(inputDStream)) + val outputBatches = runStreams[StreamingTestResult](ssc, numBatches, numBatches) + + assert(outputBatches.flatten.forall(res => + res.pValue < 0.05 && res.method == WelchTTest.methodName)) + } + + test("accuracy for null hypothesis using student t-test") { + // set parameters + val testMethod = "student" + val numBatches = 2 + val pointsPerBatch = 1000 + val meanA = 0 + val stdevA = 0.001 + val meanB = 0 + val stdevB = 0.001 + + val model = new StreamingTest() + .setWindowSize(0) + .setPeacePeriod(0) + .setTestMethod(testMethod) + + val input = generateTestData( + numBatches, pointsPerBatch, meanA, stdevA, meanB, stdevB, 42) + + // setup and run the model + val ssc = setupStreams( + input, (inputDStream: DStream[(Boolean, Double)]) => model.registerStream(inputDStream)) + val outputBatches = runStreams[StreamingTestResult](ssc, numBatches, numBatches) + + + assert(outputBatches.flatten.forall(res => + res.pValue > 0.05 && res.method == StudentTTest.methodName)) + } + + test("accuracy for alternative hypothesis using student t-test") { + // set parameters + val testMethod = "student" + val numBatches = 2 + val pointsPerBatch = 1000 + val meanA = -10 + val stdevA = 1 + val meanB = 10 + val stdevB = 1 + + val model = new StreamingTest() + .setWindowSize(0) + .setPeacePeriod(0) + .setTestMethod(testMethod) + + val input = generateTestData( + numBatches, pointsPerBatch, meanA, stdevA, meanB, stdevB, 42) + + // setup and run the model + val ssc = setupStreams( + input, (inputDStream: DStream[(Boolean, Double)]) => model.registerStream(inputDStream)) + val outputBatches = runStreams[StreamingTestResult](ssc, numBatches, numBatches) + + 
assert(outputBatches.flatten.forall(res => + res.pValue < 0.05 && res.method == StudentTTest.methodName)) + } + + test("batches within same test window are grouped") { + // set parameters + val testWindow = 3 + val numBatches = 5 + val pointsPerBatch = 100 + val meanA = -10 + val stdevA = 1 + val meanB = 10 + val stdevB = 1 + + val model = new StreamingTest() + .setWindowSize(testWindow) + .setPeacePeriod(0) + + val input = generateTestData( + numBatches, pointsPerBatch, meanA, stdevA, meanB, stdevB, 42) + + // setup and run the model + val ssc = setupStreams( + input, + (inputDStream: DStream[(Boolean, Double)]) => model.summarizeByKeyAndWindow(inputDStream)) + val outputBatches = runStreams[(Boolean, StatCounter)](ssc, numBatches, numBatches) + val outputCounts = outputBatches.flatten.map(_._2.count) + + // number of batches seen so far does not exceed testWindow, expect counts to continue growing + for (i <- 0 until testWindow) { + assert(outputCounts.drop(2 * i).take(2).forall(_ == (i + 1) * pointsPerBatch / 2)) + } + + // number of batches seen exceeds testWindow, expect counts to be constant + assert(outputCounts.drop(2 * (testWindow - 1)).forall(_ == testWindow * pointsPerBatch / 2)) + } + + + test("entries in peace period are dropped") { + // set parameters + val peacePeriod = 3 + val numBatches = 7 + val pointsPerBatch = 1000 + val meanA = -10 + val stdevA = 1 + val meanB = 10 + val stdevB = 1 + + val model = new StreamingTest() + .setWindowSize(0) + .setPeacePeriod(peacePeriod) + + val input = generateTestData( + numBatches, pointsPerBatch, meanA, stdevA, meanB, stdevB, 42) + + // setup and run the model + val ssc = setupStreams( + input, (inputDStream: DStream[(Boolean, Double)]) => model.dropPeacePeriod(inputDStream)) + val outputBatches = runStreams[(Boolean, Double)](ssc, numBatches, numBatches) + + assert(outputBatches.flatten.length == (numBatches - peacePeriod) * pointsPerBatch) + } + + test("null hypothesis when only data from one group is present") { + // set parameters + val numBatches = 2 + val pointsPerBatch = 1000 + val meanA = 0 + val stdevA = 0.001 + val meanB = 0 + val stdevB = 0.001 + + val model = new StreamingTest() + .setWindowSize(0) + .setPeacePeriod(0) + + val input = generateTestData(numBatches, pointsPerBatch, meanA, stdevA, meanB, stdevB, 42) + .map(batch => batch.filter(_._1)) // only keep one test group + + // setup and run the model + val ssc = setupStreams( + input, (inputDStream: DStream[(Boolean, Double)]) => model.registerStream(inputDStream)) + val outputBatches = runStreams[StreamingTestResult](ssc, numBatches, numBatches) + + assert(outputBatches.flatten.forall(result => (result.pValue - 1.0).abs < 0.001)) + } + + // Generate testing input with half of the entries in group A and half in group B + private def generateTestData( + numBatches: Int, + pointsPerBatch: Int, + meanA: Double, + stdevA: Double, + meanB: Double, + stdevB: Double, + seed: Int): (IndexedSeq[IndexedSeq[(Boolean, Double)]]) = { + val rand = new XORShiftRandom(seed) + val numTrues = pointsPerBatch / 2 + val data = (0 until numBatches).map { i => + (0 until numTrues).map { idx => (true, meanA + stdevA * rand.nextGaussian())} ++ + (pointsPerBatch / 2 until pointsPerBatch).map { idx => + (false, meanB + stdevB * rand.nextGaussian()) + } + } + + data + } +} From 97a99dde6e8d69a4c4c135dc1d9b1520b2548b5b Mon Sep 17 00:00:00 2001 From: Marcelo Vanzin Date: Mon, 21 Sep 2015 13:15:44 -0700 Subject: [PATCH 365/802] [SPARK-10676] [DOCS] Add documentation for SASL encryption options. 
Author: Marcelo Vanzin Closes #8803 from vanzin/SPARK-10676. --- docs/configuration.md | 16 ++++++++++++++++ docs/security.md | 22 ++++++++++++++++++++-- 2 files changed, 36 insertions(+), 2 deletions(-) diff --git a/docs/configuration.md b/docs/configuration.md index b22587c70316b..284f97ad09ec3 100644 --- a/docs/configuration.md +++ b/docs/configuration.md @@ -1285,6 +1285,22 @@ Apart from these, the following properties are also available, and may be useful not running on YARN and authentication is enabled. + + spark.authenticate.enableSaslEncryption + false + + Enable encrypted communication when authentication is enabled. This option is currently + only supported by the block transfer service. + + + + spark.network.sasl.serverAlwaysEncrypt + false + + Disable unencrypted connections for services that support SASL authentication. This is + currently supported by the external shuffle service. + + spark.core.connection.ack.wait.timeout 60s diff --git a/docs/security.md b/docs/security.md index d4ffa60e59a33..177109415180b 100644 --- a/docs/security.md +++ b/docs/security.md @@ -23,9 +23,16 @@ If your applications are using event logging, the directory where the event logs ## Encryption -Spark supports SSL for Akka and HTTP (for broadcast and file server) protocols. However SSL is not supported yet for WebUI and block transfer service. +Spark supports SSL for Akka and HTTP (for broadcast and file server) protocols. SASL encryption is +supported for the block transfer service. Encryption is not yet supported for the WebUI. -Connection encryption (SSL) configuration is organized hierarchically. The user can configure the default SSL settings which will be used for all the supported communication protocols unless they are overwritten by protocol-specific settings. This way the user can easily provide the common settings for all the protocols without disabling the ability to configure each one individually. The common SSL settings are at `spark.ssl` namespace in Spark configuration, while Akka SSL configuration is at `spark.ssl.akka` and HTTP for broadcast and file server SSL configuration is at `spark.ssl.fs`. The full breakdown can be found on the [configuration page](configuration.html). +Encryption is not yet supported for data stored by Spark in temporary local storage, such as shuffle +files, cached data, and other application files. If encrypting this data is desired, a workaround is +to configure your cluster manager to store application data on encrypted disks. + +### SSL Configuration + +Configuration for SSL is organized hierarchically. The user can configure the default SSL settings which will be used for all the supported communication protocols unless they are overwritten by protocol-specific settings. This way the user can easily provide the common settings for all the protocols without disabling the ability to configure each one individually. The common SSL settings are at `spark.ssl` namespace in Spark configuration, while Akka SSL configuration is at `spark.ssl.akka` and HTTP for broadcast and file server SSL configuration is at `spark.ssl.fs`. The full breakdown can be found on the [configuration page](configuration.html). SSL must be configured on each node and configured for each component involved in communication using the particular protocol. 
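A minimal sketch of how an application might combine the two properties documented in `configuration.md` above (the rest of the setup, including shared-secret distribution, is assumed to already be in place; see the SASL section that follows):

    import org.apache.spark.SparkConf

    // Illustrative only; the application name is made up. Authentication is a
    // prerequisite for SASL encryption of the block transfer service, per the docs above.
    val conf = new SparkConf()
      .setAppName("sasl-encrypted-app")
      .set("spark.authenticate", "true")
      .set("spark.authenticate.enableSaslEncryption", "true")

    // On the external shuffle service side, operators can additionally set
    // spark.network.sasl.serverAlwaysEncrypt=true to reject unencrypted connections.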
@@ -47,6 +54,17 @@ follows: * Import all exported public keys into a single trust-store * Distribute the trust-store over the nodes +### Configuring SASL Encryption + +SASL encryption is currently supported for the block transfer service when authentication +(`spark.authenticate`) is enabled. To enable SASL encryption for an application, set +`spark.authenticate.enableSaslEncryption` to `true` in the application's configuration. + +When using an external shuffle service, it's possible to disable unencrypted connections by setting +`spark.network.sasl.serverAlwaysEncrypt` to `true` in the shuffle service's configuration. If that +option is enabled, applications that are not set up to use SASL encryption will fail to connect to +the shuffle service. + ## Configuring Ports for Network Security Spark makes heavy use of the network, and some environments have strict requirements for using tight From 362539f8d97f6bb67f0d0983f7dea36b77cc9d18 Mon Sep 17 00:00:00 2001 From: Holden Karau Date: Mon, 21 Sep 2015 13:33:10 -0700 Subject: [PATCH 366/802] [SPARK-10630] [SQL] Add a createDataFrame API that takes in a java list It would be nice to support creating a DataFrame directly from a Java List of Row. Author: Holden Karau Closes #8779 from holdenk/SPARK-10630-create-DataFrame-from-Java-List. --- .../scala/org/apache/spark/sql/SQLContext.scala | 14 ++++++++++++++ .../org/apache/spark/sql/JavaDataFrameSuite.java | 10 ++++++++++ 2 files changed, 24 insertions(+) diff --git a/sql/core/src/main/scala/org/apache/spark/sql/SQLContext.scala b/sql/core/src/main/scala/org/apache/spark/sql/SQLContext.scala index f099940800cc0..1bd4e26fb3162 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/SQLContext.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/SQLContext.scala @@ -476,6 +476,20 @@ class SQLContext(@transient val sparkContext: SparkContext) createDataFrame(rowRDD.rdd, schema) } + /** + * :: DeveloperApi :: + * Creates a [[DataFrame]] from an [[java.util.List]] containing [[Row]]s using the given schema. + * It is important to make sure that the structure of every [[Row]] of the provided List matches + * the provided schema. Otherwise, there will be runtime exception. + * + * @group dataframes + * @since 1.6.0 + */ + @DeveloperApi + def createDataFrame(rows: java.util.List[Row], schema: StructType): DataFrame = { + DataFrame(self, LocalRelation.fromExternalRows(schema.toAttributes, rows.asScala)) + } + /** * Applies a schema to an RDD of Java Beans. 
* diff --git a/sql/core/src/test/java/test/org/apache/spark/sql/JavaDataFrameSuite.java b/sql/core/src/test/java/test/org/apache/spark/sql/JavaDataFrameSuite.java index 5f9abd4999ce0..250ac2e1092d4 100644 --- a/sql/core/src/test/java/test/org/apache/spark/sql/JavaDataFrameSuite.java +++ b/sql/core/src/test/java/test/org/apache/spark/sql/JavaDataFrameSuite.java @@ -37,6 +37,7 @@ import static org.apache.spark.sql.functions.*; import org.apache.spark.sql.test.TestSQLContext; import org.apache.spark.sql.types.*; +import static org.apache.spark.sql.types.DataTypes.*; public class JavaDataFrameSuite { private transient JavaSparkContext jsc; @@ -181,6 +182,15 @@ public void testCreateDataFrameFromJavaBeans() { } } + @Test + public void testCreateDataFromFromList() { + StructType schema = createStructType(Arrays.asList(createStructField("i", IntegerType, true))); + List rows = Arrays.asList(RowFactory.create(0)); + DataFrame df = context.createDataFrame(rows, schema); + Row[] result = df.collect(); + Assert.assertEquals(1, result.length); + } + private static final Comparator crosstabRowComparator = new Comparator() { @Override public int compare(Row row1, Row row2) { From 7c4f852bfc39537840f56cd8121457a0dc1ad7c1 Mon Sep 17 00:00:00 2001 From: noelsmith Date: Mon, 21 Sep 2015 14:24:19 -0700 Subject: [PATCH 367/802] [DOC] [PYSPARK] [MLLIB] Added newlines to docstrings to fix parameter formatting Added newlines before `:param ...:` and `:return:` markup. Without these, parameter lists aren't formatted correctly in the API docs. I.e: ![screen shot 2015-09-21 at 21 49 26](https://cloud.githubusercontent.com/assets/11915197/10004686/de3c41d4-60aa-11e5-9c50-a46dcb51243f.png) .. looks like this once newline is added: ![screen shot 2015-09-21 at 21 50 14](https://cloud.githubusercontent.com/assets/11915197/10004706/f86bfb08-60aa-11e5-8524-ae4436713502.png) Author: noelsmith Closes #8851 from noel-smith/docstring-missing-newline-fix. --- python/pyspark/ml/param/__init__.py | 4 ++++ python/pyspark/ml/pipeline.py | 1 + python/pyspark/ml/tuning.py | 2 ++ python/pyspark/ml/wrapper.py | 2 ++ python/pyspark/mllib/evaluation.py | 2 +- python/pyspark/mllib/linalg/__init__.py | 1 + python/pyspark/streaming/context.py | 2 ++ python/pyspark/streaming/mqtt.py | 1 + 8 files changed, 14 insertions(+), 1 deletion(-) diff --git a/python/pyspark/ml/param/__init__.py b/python/pyspark/ml/param/__init__.py index eeeac49b21980..2e0c63cb47b17 100644 --- a/python/pyspark/ml/param/__init__.py +++ b/python/pyspark/ml/param/__init__.py @@ -164,6 +164,7 @@ def extractParamMap(self, extra=None): a flat param map, where the latter value is used if there exist conflicts, i.e., with ordering: default param values < user-supplied values < extra. + :param extra: extra param values :return: merged param map """ @@ -182,6 +183,7 @@ def copy(self, extra=None): embedded and extra parameters over and returns the copy. Subclasses should override this method if the default approach is not sufficient. + :param extra: Extra parameters to copy to the new instance :return: Copy of this instance """ @@ -201,6 +203,7 @@ def _shouldOwn(self, param): def _resolveParam(self, param): """ Resolves a param and validates the ownership. + :param param: param name or the param instance, which must belong to this Params instance :return: resolved param instance @@ -243,6 +246,7 @@ def _copyValues(self, to, extra=None): """ Copies param values from this instance to another instance for params shared by them. 
+ :param to: the target instance :param extra: extra params to be copied :return: the target instance with param values copied diff --git a/python/pyspark/ml/pipeline.py b/python/pyspark/ml/pipeline.py index 13cf2b0f7bbd9..312a8502b3a2c 100644 --- a/python/pyspark/ml/pipeline.py +++ b/python/pyspark/ml/pipeline.py @@ -154,6 +154,7 @@ def __init__(self, stages=None): def setStages(self, value): """ Set pipeline stages. + :param value: a list of transformers or estimators :return: the pipeline instance """ diff --git a/python/pyspark/ml/tuning.py b/python/pyspark/ml/tuning.py index ab5621f45c72c..705ee53685752 100644 --- a/python/pyspark/ml/tuning.py +++ b/python/pyspark/ml/tuning.py @@ -254,6 +254,7 @@ def copy(self, extra=None): Creates a copy of this instance with a randomly generated uid and some extra params. This copies creates a deep copy of the embedded paramMap, and copies the embedded and extra parameters over. + :param extra: Extra parameters to copy to the new instance :return: Copy of this instance """ @@ -290,6 +291,7 @@ def copy(self, extra=None): and some extra params. This copies the underlying bestModel, creates a deep copy of the embedded paramMap, and copies the embedded and extra parameters over. + :param extra: Extra parameters to copy to the new instance :return: Copy of this instance """ diff --git a/python/pyspark/ml/wrapper.py b/python/pyspark/ml/wrapper.py index 8218c7c5f801c..4bcb4aaec89de 100644 --- a/python/pyspark/ml/wrapper.py +++ b/python/pyspark/ml/wrapper.py @@ -119,6 +119,7 @@ def _create_model(self, java_model): def _fit_java(self, dataset): """ Fits a Java model to the input dataset. + :param dataset: input dataset, which is an instance of :py:class:`pyspark.sql.DataFrame` :param params: additional params (overwriting embedded values) @@ -173,6 +174,7 @@ def copy(self, extra=None): extra params. This implementation first calls Params.copy and then make a copy of the companion Java model with extra params. So both the Python wrapper and the Java model get copied. + :param extra: Extra parameters to copy to the new instance :return: Copy of this instance """ diff --git a/python/pyspark/mllib/evaluation.py b/python/pyspark/mllib/evaluation.py index 4398ca86f2ec2..a90e5c50e54b9 100644 --- a/python/pyspark/mllib/evaluation.py +++ b/python/pyspark/mllib/evaluation.py @@ -147,7 +147,7 @@ class MulticlassMetrics(JavaModelWrapper): """ Evaluator for multiclass classification. - :param predictionAndLabels an RDD of (prediction, label) pairs. + :param predictionAndLabels: an RDD of (prediction, label) pairs. >>> predictionAndLabels = sc.parallelize([(0.0, 0.0), (0.0, 1.0), (0.0, 0.0), ... (1.0, 0.0), (1.0, 1.0), (1.0, 1.0), (1.0, 1.0), (2.0, 2.0), (2.0, 0.0)]) diff --git a/python/pyspark/mllib/linalg/__init__.py b/python/pyspark/mllib/linalg/__init__.py index f929e3e96fbe2..ea42127f1651f 100644 --- a/python/pyspark/mllib/linalg/__init__.py +++ b/python/pyspark/mllib/linalg/__init__.py @@ -240,6 +240,7 @@ class Vector(object): def toArray(self): """ Convert the vector into an numpy.ndarray + :return: numpy.ndarray """ raise NotImplementedError diff --git a/python/pyspark/streaming/context.py b/python/pyspark/streaming/context.py index 4069d7a149986..a8c9ffc235b9e 100644 --- a/python/pyspark/streaming/context.py +++ b/python/pyspark/streaming/context.py @@ -240,6 +240,7 @@ def start(self): def awaitTermination(self, timeout=None): """ Wait for the execution to stop. 
+ @param timeout: time to wait in seconds """ if timeout is None: @@ -252,6 +253,7 @@ def awaitTerminationOrTimeout(self, timeout): Wait for the execution to stop. Return `true` if it's stopped; or throw the reported error during the execution; or `false` if the waiting time elapsed before returning from the method. + @param timeout: time to wait in seconds """ self._jssc.awaitTerminationOrTimeout(int(timeout * 1000)) diff --git a/python/pyspark/streaming/mqtt.py b/python/pyspark/streaming/mqtt.py index f06598971c548..fa83006c36db6 100644 --- a/python/pyspark/streaming/mqtt.py +++ b/python/pyspark/streaming/mqtt.py @@ -31,6 +31,7 @@ def createStream(ssc, brokerUrl, topic, storageLevel=StorageLevel.MEMORY_AND_DISK_SER_2): """ Create an input stream that pulls messages from a Mqtt Broker. + :param ssc: StreamingContext object :param brokerUrl: Url of remote mqtt publisher :param topic: topic name to subscribe to From 72869883f12b6e0a4e5aad79c0ac2cfdb4d83f09 Mon Sep 17 00:00:00 2001 From: Tathagata Das Date: Mon, 21 Sep 2015 16:47:52 -0700 Subject: [PATCH 368/802] [SPARK-10649] [STREAMING] Prevent inheriting job group and irrelevant job description in streaming jobs The job group, and job descriptions information is passed through thread local properties, and get inherited by child threads. In case of spark streaming, the streaming jobs inherit these properties from the thread that called streamingContext.start(). This may not make sense. 1. Job group: This is mainly used for cancelling a group of jobs together. It does not make sense to cancel streaming jobs like this, as the effect will be unpredictable. And its not a valid usecase any way, to cancel a streaming context, call streamingContext.stop() 2. Job description: This is used to pass on nice text descriptions for jobs to show up in the UI. The job description of the thread that calls streamingContext.start() is not useful for all the streaming jobs, as it does not make sense for all of the streaming jobs to have the same description, and the description may or may not be related to streaming. The solution in this PR is meant for the Spark master branch, where local properties are inherited by cloning the properties. The job group and job description in the thread that starts the streaming scheduler are explicitly removed, so that all the subsequent child threads does not inherit them. Also, the starting is done in a new child thread, so that setting the job group and description for streaming, does not change those properties in the thread that called streamingContext.start(). Author: Tathagata Das Closes #8781 from tdas/SPARK-10649. 
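A minimal sketch of the scenario this change addresses (setup values are illustrative; the new StreamingContextSuite test in the diff below checks the same behavior):

    import org.apache.spark.SparkConf
    import org.apache.spark.streaming.{Seconds, StreamingContext}

    val conf = new SparkConf().setMaster("local[2]").setAppName("job-group-demo")
    val ssc = new StreamingContext(conf, Seconds(1))
    // ... register input streams and output operations here (required before start()) ...

    // The thread that calls start() has its own job group and description set for
    // unrelated batch work.
    ssc.sparkContext.setJobGroup("non-streaming", "non-streaming work", interruptOnCancel = true)
    ssc.start()
    // Before this patch, jobs generated by the streaming scheduler inherited the
    // "non-streaming" group and description, so cancelJobGroup("non-streaming") could also
    // kill streaming batches. With the scheduler now started on a separate "streaming-start"
    // thread that clears those thread-local properties, streaming jobs carry neither of them.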
--- .../org/apache/spark/util/ThreadUtils.scala | 59 +++++++++++++++++++ .../apache/spark/util/ThreadUtilsSuite.scala | 24 +++++++- .../spark/streaming/StreamingContext.scala | 15 ++++- .../streaming/StreamingContextSuite.scala | 32 ++++++++++ 4 files changed, 126 insertions(+), 4 deletions(-) diff --git a/core/src/main/scala/org/apache/spark/util/ThreadUtils.scala b/core/src/main/scala/org/apache/spark/util/ThreadUtils.scala index ca5624a3d8b3d..22e291a2b48d6 100644 --- a/core/src/main/scala/org/apache/spark/util/ThreadUtils.scala +++ b/core/src/main/scala/org/apache/spark/util/ThreadUtils.scala @@ -21,6 +21,7 @@ package org.apache.spark.util import java.util.concurrent._ import scala.concurrent.{ExecutionContext, ExecutionContextExecutor} +import scala.util.control.NonFatal import com.google.common.util.concurrent.{MoreExecutors, ThreadFactoryBuilder} @@ -86,4 +87,62 @@ private[spark] object ThreadUtils { val threadFactory = new ThreadFactoryBuilder().setDaemon(true).setNameFormat(threadName).build() Executors.newSingleThreadScheduledExecutor(threadFactory) } + + /** + * Run a piece of code in a new thread and return the result. Exception in the new thread is + * thrown in the caller thread with an adjusted stack trace that removes references to this + * method for clarity. The exception stack traces will be like the following + * + * SomeException: exception-message + * at CallerClass.body-method (sourcefile.scala) + * at ... run in separate thread using org.apache.spark.util.ThreadUtils ... () + * at CallerClass.caller-method (sourcefile.scala) + * ... + */ + def runInNewThread[T]( + threadName: String, + isDaemon: Boolean = true)(body: => T): T = { + @volatile var exception: Option[Throwable] = None + @volatile var result: T = null.asInstanceOf[T] + + val thread = new Thread(threadName) { + override def run(): Unit = { + try { + result = body + } catch { + case NonFatal(e) => + exception = Some(e) + } + } + } + thread.setDaemon(isDaemon) + thread.start() + thread.join() + + exception match { + case Some(realException) => + // Remove the part of the stack that shows method calls into this helper method + // This means drop everything from the top until the stack element + // ThreadUtils.runInNewThread(), and then drop that as well (hence the `drop(1)`). + val baseStackTrace = Thread.currentThread().getStackTrace().dropWhile( + ! _.getClassName.contains(this.getClass.getSimpleName)).drop(1) + + // Remove the part of the new thread stack that shows methods call from this helper method + val extraStackTrace = realException.getStackTrace.takeWhile( + ! _.getClassName.contains(this.getClass.getSimpleName)) + + // Combine the two stack traces, with a place holder just specifying that there + // was a helper method used, without any further details of the helper + val placeHolderStackElem = new StackTraceElement( + s"... 
run in separate thread using ${ThreadUtils.getClass.getName.stripSuffix("$")} ..", + " ", "", -1) + val finalStackTrace = extraStackTrace ++ Seq(placeHolderStackElem) ++ baseStackTrace + + // Update the stack trace and rethrow the exception in the caller thread + realException.setStackTrace(finalStackTrace) + throw realException + case None => + result + } + } } diff --git a/core/src/test/scala/org/apache/spark/util/ThreadUtilsSuite.scala b/core/src/test/scala/org/apache/spark/util/ThreadUtilsSuite.scala index 8c51e6b14b7fc..620e4debf4e08 100644 --- a/core/src/test/scala/org/apache/spark/util/ThreadUtilsSuite.scala +++ b/core/src/test/scala/org/apache/spark/util/ThreadUtilsSuite.scala @@ -20,8 +20,9 @@ package org.apache.spark.util import java.util.concurrent.{CountDownLatch, TimeUnit} -import scala.concurrent.{Await, Future} import scala.concurrent.duration._ +import scala.concurrent.{Await, Future} +import scala.util.Random import org.apache.spark.SparkFunSuite @@ -66,4 +67,25 @@ class ThreadUtilsSuite extends SparkFunSuite { val futureThreadName = Await.result(f, 10.seconds) assert(futureThreadName === callerThreadName) } + + test("runInNewThread") { + import ThreadUtils._ + assert(runInNewThread("thread-name") { Thread.currentThread().getName } === "thread-name") + assert(runInNewThread("thread-name") { Thread.currentThread().isDaemon } === true) + assert( + runInNewThread("thread-name", isDaemon = false) { Thread.currentThread().isDaemon } === false + ) + val uniqueExceptionMessage = "test" + Random.nextInt() + val exception = intercept[IllegalArgumentException] { + runInNewThread("thread-name") { throw new IllegalArgumentException(uniqueExceptionMessage) } + } + assert(exception.asInstanceOf[IllegalArgumentException].getMessage === uniqueExceptionMessage) + assert(exception.getStackTrace.mkString("\n").contains( + "... run in separate thread using org.apache.spark.util.ThreadUtils ...") === true, + "stack trace does not contain expected place holder" + ) + assert(exception.getStackTrace.mkString("\n").contains("ThreadUtils.scala") === false, + "stack trace contains unexpected references to ThreadUtils" + ) + } } diff --git a/streaming/src/main/scala/org/apache/spark/streaming/StreamingContext.scala b/streaming/src/main/scala/org/apache/spark/streaming/StreamingContext.scala index b496d1f341a0b..6720ba4f72cf3 100644 --- a/streaming/src/main/scala/org/apache/spark/streaming/StreamingContext.scala +++ b/streaming/src/main/scala/org/apache/spark/streaming/StreamingContext.scala @@ -44,7 +44,7 @@ import org.apache.spark.streaming.dstream._ import org.apache.spark.streaming.receiver.{ActorReceiver, ActorSupervisorStrategy, Receiver} import org.apache.spark.streaming.scheduler.{JobScheduler, StreamingListener} import org.apache.spark.streaming.ui.{StreamingJobProgressListener, StreamingTab} -import org.apache.spark.util.{CallSite, ShutdownHookManager, Utils} +import org.apache.spark.util.{CallSite, ShutdownHookManager, ThreadUtils} /** * Main entry point for Spark Streaming functionality. 
It provides methods used to create @@ -588,12 +588,20 @@ class StreamingContext private[streaming] ( state match { case INITIALIZED => startSite.set(DStream.getCreationSite()) - sparkContext.setCallSite(startSite.get) StreamingContext.ACTIVATION_LOCK.synchronized { StreamingContext.assertNoOtherContextIsActive() try { validate() - scheduler.start() + + // Start the streaming scheduler in a new thread, so that thread local properties + // like call sites and job groups can be reset without affecting those of the + // current thread. + ThreadUtils.runInNewThread("streaming-start") { + sparkContext.setCallSite(startSite.get) + sparkContext.clearJobGroup() + sparkContext.setLocalProperty(SparkContext.SPARK_JOB_INTERRUPT_ON_CANCEL, "false") + scheduler.start() + } state = StreamingContextState.ACTIVE } catch { case NonFatal(e) => @@ -618,6 +626,7 @@ class StreamingContext private[streaming] ( } } + /** * Wait for the execution to stop. Any exceptions that occurs during the execution * will be thrown in this thread. diff --git a/streaming/src/test/scala/org/apache/spark/streaming/StreamingContextSuite.scala b/streaming/src/test/scala/org/apache/spark/streaming/StreamingContextSuite.scala index d26894e88fc26..3b9d0d15ea04c 100644 --- a/streaming/src/test/scala/org/apache/spark/streaming/StreamingContextSuite.scala +++ b/streaming/src/test/scala/org/apache/spark/streaming/StreamingContextSuite.scala @@ -180,6 +180,38 @@ class StreamingContextSuite extends SparkFunSuite with BeforeAndAfter with Timeo assert(ssc.scheduler.isStarted === false) } + test("start should set job group and description of streaming jobs correctly") { + ssc = new StreamingContext(conf, batchDuration) + ssc.sc.setJobGroup("non-streaming", "non-streaming", true) + val sc = ssc.sc + + @volatile var jobGroupFound: String = "" + @volatile var jobDescFound: String = "" + @volatile var jobInterruptFound: String = "" + @volatile var allFound: Boolean = false + + addInputStream(ssc).foreachRDD { rdd => + jobGroupFound = sc.getLocalProperty(SparkContext.SPARK_JOB_GROUP_ID) + jobDescFound = sc.getLocalProperty(SparkContext.SPARK_JOB_DESCRIPTION) + jobInterruptFound = sc.getLocalProperty(SparkContext.SPARK_JOB_INTERRUPT_ON_CANCEL) + allFound = true + } + ssc.start() + + eventually(timeout(10 seconds), interval(10 milliseconds)) { + assert(allFound === true) + } + + // Verify streaming jobs have expected thread-local properties + assert(jobGroupFound === null) + assert(jobDescFound === null) + assert(jobInterruptFound === "false") + + // Verify current thread's thread-local properties have not changed + assert(sc.getLocalProperty(SparkContext.SPARK_JOB_GROUP_ID) === "non-streaming") + assert(sc.getLocalProperty(SparkContext.SPARK_JOB_DESCRIPTION) === "non-streaming") + assert(sc.getLocalProperty(SparkContext.SPARK_JOB_INTERRUPT_ON_CANCEL) === "true") + } test("start multiple times") { ssc = new StreamingContext(master, appName, batchDuration) From 0494c80ef54f6f3a8c6f2d92abfe1a77a91df8b0 Mon Sep 17 00:00:00 2001 From: Yin Huai Date: Mon, 21 Sep 2015 18:06:45 -0700 Subject: [PATCH 369/802] [SPARK-10495] [SQL] Read date values in JSON data stored by Spark 1.5.0. https://issues.apache.org/jira/browse/SPARK-10681 Author: Yin Huai Closes #8806 from yhuai/SPARK-10495. 
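For context, here is a small editor-added PySpark sketch of the case this patch covers (the SQLContext and the inline records are assumptions for illustration): with an explicit DateType column in the schema, the JSON reader now accepts both the usual "yyyy-mm-dd" form and the days-since-epoch string that Spark 1.5.0 wrote out.

```python
from pyspark.sql.types import DateType, StringType, StructField, StructType

# `sqlContext` is assumed to be an existing SQLContext.
schema = StructType([
    StructField("name", StringType(), True),
    StructField("d", DateType(), True),
])

# One record uses the "yyyy-mm-dd" form, the other the days-since-epoch string
# written by Spark 1.5.0 ("16436" corresponds to 2015-01-01).
records = sqlContext.sparkContext.parallelize([
    '{"name": "a", "d": "2015-01-01"}',
    '{"name": "b", "d": "16436"}',
])

df = sqlContext.read.schema(schema).json(records)
df.show()   # after this change both rows come back with d = 2015-01-01
```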
--- .../datasources/json/JacksonGenerator.scala | 36 ++++++ .../datasources/json/JacksonParser.scala | 15 ++- .../datasources/json/JsonSuite.scala | 103 +++++++++++++++++- 3 files changed, 152 insertions(+), 2 deletions(-) diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/json/JacksonGenerator.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/json/JacksonGenerator.scala index f65c7bbd6e29d..23bada1ddd92f 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/json/JacksonGenerator.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/json/JacksonGenerator.scala @@ -73,6 +73,38 @@ private[sql] object JacksonGenerator { valWriter(field.dataType, v) } gen.writeEndObject() + + // For UDT, udt.serialize will produce SQL types. So, we need the following three cases. + case (ArrayType(ty, _), v: ArrayData) => + gen.writeStartArray() + v.foreach(ty, (_, value) => valWriter(ty, value)) + gen.writeEndArray() + + case (MapType(kt, vt, _), v: MapData) => + gen.writeStartObject() + v.foreach(kt, vt, { (k, v) => + gen.writeFieldName(k.toString) + valWriter(vt, v) + }) + gen.writeEndObject() + + case (StructType(ty), v: InternalRow) => + gen.writeStartObject() + var i = 0 + while (i < ty.length) { + val field = ty(i) + val value = v.get(i, field.dataType) + if (value != null) { + gen.writeFieldName(field.name) + valWriter(field.dataType, value) + } + i += 1 + } + gen.writeEndObject() + + case (dt, v) => + sys.error( + s"Failed to convert value $v (class of ${v.getClass}}) with the type of $dt to JSON.") } valWriter(rowSchema, row) @@ -133,6 +165,10 @@ private[sql] object JacksonGenerator { i += 1 } gen.writeEndObject() + + case (dt, v) => + sys.error( + s"Failed to convert value $v (class of ${v.getClass}}) with the type of $dt to JSON.") } valWriter(rowSchema, row) diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/json/JacksonParser.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/json/JacksonParser.scala index ff4d8c04e8eaf..c51140749c8e6 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/json/JacksonParser.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/json/JacksonParser.scala @@ -62,10 +62,23 @@ private[sql] object JacksonParser { // guard the non string type null + case (VALUE_STRING, BinaryType) => + parser.getBinaryValue + case (VALUE_STRING, DateType) => - DateTimeUtils.millisToDays(DateTimeUtils.stringToTime(parser.getText).getTime) + val stringValue = parser.getText + if (stringValue.contains("-")) { + // The format of this string will probably be "yyyy-mm-dd". + DateTimeUtils.millisToDays(DateTimeUtils.stringToTime(parser.getText).getTime) + } else { + // In Spark 1.5.0, we store the data as number of days since epoch in string. + // So, we just convert it to Int. + stringValue.toInt + } case (VALUE_STRING, TimestampType) => + // This one will lose microseconds parts. + // See https://issues.apache.org/jira/browse/SPARK-10681. 
DateTimeUtils.stringToTime(parser.getText).getTime * 1000L case (VALUE_NUMBER_INT, TimestampType) => diff --git a/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/json/JsonSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/json/JsonSuite.scala index 6a18cc6d27138..b614e6c4148fd 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/json/JsonSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/json/JsonSuite.scala @@ -24,7 +24,7 @@ import com.fasterxml.jackson.core.JsonFactory import org.apache.spark.rdd.RDD import org.scalactic.Tolerance._ -import org.apache.spark.sql.{QueryTest, Row, SQLConf} +import org.apache.spark.sql._ import org.apache.spark.sql.catalyst.util.DateTimeUtils import org.apache.spark.sql.execution.datasources.{ResolvedDataSource, LogicalRelation} import org.apache.spark.sql.execution.datasources.json.InferSchema.compatibleType @@ -1159,4 +1159,105 @@ class JsonSuite extends QueryTest with SharedSQLContext with TestJsonData { "SELECT count(a) FROM test_myjson_with_part where d1 = 1"), Row(9)) }) } + + test("backward compatibility") { + // This test we make sure our JSON support can read JSON data generated by previous version + // of Spark generated through toJSON method and JSON data source. + // The data is generated by the following program. + // Here are a few notes: + // - Spark 1.5.0 cannot save timestamp data. So, we manually added timestamp field (col13) + // in the JSON object. + // - For Spark before 1.5.1, we do not generate UDTs. So, we manually added the UDT value to + // JSON objects generated by those Spark versions (col17). + // - If the type is NullType, we do not write data out. + + // Create the schema. + val struct = + StructType( + StructField("f1", FloatType, true) :: + StructField("f2", ArrayType(BooleanType), true) :: Nil) + + val dataTypes = + Seq( + StringType, BinaryType, NullType, BooleanType, + ByteType, ShortType, IntegerType, LongType, + FloatType, DoubleType, DecimalType(25, 5), DecimalType(6, 5), + DateType, TimestampType, + ArrayType(IntegerType), MapType(StringType, LongType), struct, + new MyDenseVectorUDT()) + val fields = dataTypes.zipWithIndex.map { case (dataType, index) => + StructField(s"col$index", dataType, nullable = true) + } + val schema = StructType(fields) + + val constantValues = + Seq( + "a string in binary".getBytes("UTF-8"), + null, + true, + 1.toByte, + 2.toShort, + 3, + Long.MaxValue, + 0.25.toFloat, + 0.75, + new java.math.BigDecimal(s"1234.23456"), + new java.math.BigDecimal(s"1.23456"), + java.sql.Date.valueOf("2015-01-01"), + java.sql.Timestamp.valueOf("2015-01-01 23:50:59.123"), + Seq(2, 3, 4), + Map("a string" -> 2000L), + Row(4.75.toFloat, Seq(false, true)), + new MyDenseVector(Array(0.25, 2.25, 4.25))) + val data = + Row.fromSeq(Seq("Spark " + sqlContext.sparkContext.version) ++ constantValues) :: Nil + + // Data generated by previous versions. 
+ // scalastyle:off + val existingJSONData = + """{"col0":"Spark 1.2.2","col1":"YSBzdHJpbmcgaW4gYmluYXJ5","col3":true,"col4":1,"col5":2,"col6":3,"col7":9223372036854775807,"col8":0.25,"col9":0.75,"col10":1234.23456,"col11":1.23456,"col12":"2015-01-01","col13":"2015-01-01 23:50:59.123","col14":[2,3,4],"col15":{"a string":2000},"col16":{"f1":4.75,"f2":[false,true]},"col17":[0.25,2.25,4.25]}""" :: + """{"col0":"Spark 1.3.1","col1":"YSBzdHJpbmcgaW4gYmluYXJ5","col3":true,"col4":1,"col5":2,"col6":3,"col7":9223372036854775807,"col8":0.25,"col9":0.75,"col10":1234.23456,"col11":1.23456,"col12":"2015-01-01","col13":"2015-01-01 23:50:59.123","col14":[2,3,4],"col15":{"a string":2000},"col16":{"f1":4.75,"f2":[false,true]},"col17":[0.25,2.25,4.25]}""" :: + """{"col0":"Spark 1.3.1","col1":"YSBzdHJpbmcgaW4gYmluYXJ5","col3":true,"col4":1,"col5":2,"col6":3,"col7":9223372036854775807,"col8":0.25,"col9":0.75,"col10":1234.23456,"col11":1.23456,"col12":"2015-01-01","col13":"2015-01-01 23:50:59.123","col14":[2,3,4],"col15":{"a string":2000},"col16":{"f1":4.75,"f2":[false,true]},"col17":[0.25,2.25,4.25]}""" :: + """{"col0":"Spark 1.4.1","col1":"YSBzdHJpbmcgaW4gYmluYXJ5","col3":true,"col4":1,"col5":2,"col6":3,"col7":9223372036854775807,"col8":0.25,"col9":0.75,"col10":1234.23456,"col11":1.23456,"col12":"2015-01-01","col13":"2015-01-01 23:50:59.123","col14":[2,3,4],"col15":{"a string":2000},"col16":{"f1":4.75,"f2":[false,true]},"col17":[0.25,2.25,4.25]}""" :: + """{"col0":"Spark 1.4.1","col1":"YSBzdHJpbmcgaW4gYmluYXJ5","col3":true,"col4":1,"col5":2,"col6":3,"col7":9223372036854775807,"col8":0.25,"col9":0.75,"col10":1234.23456,"col11":1.23456,"col12":"2015-01-01","col13":"2015-01-01 23:50:59.123","col14":[2,3,4],"col15":{"a string":2000},"col16":{"f1":4.75,"f2":[false,true]},"col17":[0.25,2.25,4.25]}""" :: + """{"col0":"Spark 1.5.0","col1":"YSBzdHJpbmcgaW4gYmluYXJ5","col3":true,"col4":1,"col5":2,"col6":3,"col7":9223372036854775807,"col8":0.25,"col9":0.75,"col10":1234.23456,"col11":1.23456,"col12":"2015-01-01","col13":"2015-01-01 23:50:59.123","col14":[2,3,4],"col15":{"a string":2000},"col16":{"f1":4.75,"f2":[false,true]},"col17":[0.25,2.25,4.25]}""" :: + """{"col0":"Spark 1.5.0","col1":"YSBzdHJpbmcgaW4gYmluYXJ5","col3":true,"col4":1,"col5":2,"col6":3,"col7":9223372036854775807,"col8":0.25,"col9":0.75,"col10":1234.23456,"col11":1.23456,"col12":"16436","col13":"2015-01-01 23:50:59.123","col14":[2,3,4],"col15":{"a string":2000},"col16":{"f1":4.75,"f2":[false,true]},"col17":[0.25,2.25,4.25]}""" :: Nil + // scalastyle:on + + // Generate data for the current version. + val df = sqlContext.createDataFrame(sqlContext.sparkContext.parallelize(data, 1), schema) + withTempPath { path => + df.write.format("json").mode("overwrite").save(path.getCanonicalPath) + + // df.toJSON will convert internal rows to external rows first and then generate + // JSON objects. While, df.write.format("json") will write internal rows directly. + val allJSON = + existingJSONData ++ + df.toJSON.collect() ++ + sparkContext.textFile(path.getCanonicalPath).collect() + + Utils.deleteRecursively(path) + sparkContext.parallelize(allJSON, 1).saveAsTextFile(path.getCanonicalPath) + + // Read data back with the schema specified. 
+ val col0Values = + Seq( + "Spark 1.2.2", + "Spark 1.3.1", + "Spark 1.3.1", + "Spark 1.4.1", + "Spark 1.4.1", + "Spark 1.5.0", + "Spark 1.5.0", + "Spark " + sqlContext.sparkContext.version, + "Spark " + sqlContext.sparkContext.version) + val expectedResult = col0Values.map { v => + Row.fromSeq(Seq(v) ++ constantValues) + } + checkAnswer( + sqlContext.read.format("json").schema(schema).load(path.getCanonicalPath), + expectedResult + ) + } + } } From c986e933a900602af47966bd41edb2116c421a39 Mon Sep 17 00:00:00 2001 From: Hossein Date: Mon, 21 Sep 2015 21:09:59 -0700 Subject: [PATCH 370/802] [SPARK-10711] [SPARKR] Do not assume spark.submit.deployMode is always set In ```RUtils.sparkRPackagePath()``` we 1. Call ``` sys.props("spark.submit.deployMode")``` which returns null if ```spark.submit.deployMode``` is not suet 2. Call ``` sparkConf.get("spark.submit.deployMode")``` which throws ```NoSuchElementException``` if ```spark.submit.deployMode``` is not set. This patch simply passes a default value ("cluster") for ```spark.submit.deployMode```. cc rxin Author: Hossein Closes #8832 from falaki/SPARK-10711. --- core/src/main/scala/org/apache/spark/api/r/RUtils.scala | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/core/src/main/scala/org/apache/spark/api/r/RUtils.scala b/core/src/main/scala/org/apache/spark/api/r/RUtils.scala index 9e807cc52f18c..fd5646b5b6372 100644 --- a/core/src/main/scala/org/apache/spark/api/r/RUtils.scala +++ b/core/src/main/scala/org/apache/spark/api/r/RUtils.scala @@ -44,7 +44,7 @@ private[spark] object RUtils { (sys.props("spark.master"), sys.props("spark.submit.deployMode")) } else { val sparkConf = SparkEnv.get.conf - (sparkConf.get("spark.master"), sparkConf.get("spark.submit.deployMode")) + (sparkConf.get("spark.master"), sparkConf.get("spark.submit.deployMode", "client")) } val isYarnCluster = master != null && master.contains("yarn") && deployMode == "cluster" From 1cd67415728e660a90e4dbe136272b5d6b8f1142 Mon Sep 17 00:00:00 2001 From: Holden Karau Date: Mon, 21 Sep 2015 23:21:24 -0700 Subject: [PATCH 371/802] [SPARK-9821] [PYSPARK] pyspark-reduceByKey-should-take-a-custom-partitioner from the issue: In Scala, I can supply a custom partitioner to reduceByKey (and other aggregation/repartitioning methods like aggregateByKey and combinedByKey), but as far as I can tell from the Pyspark API, there's no way to do the same in Python. Here's an example of my code in Scala: weblogs.map(s => (getFileType(s), 1)).reduceByKey(new FileTypePartitioner(),_+_) But I can't figure out how to do the same in Python. The closest I can get is to call repartition before reduceByKey like so: weblogs.map(lambda s: (getFileType(s), 1)).partitionBy(3,hash_filetype).reduceByKey(lambda v1,v2: v1+v2).collect() But that defeats the purpose, because I'm shuffling twice instead of once, so my performance is worse instead of better. Author: Holden Karau Closes #8569 from holdenk/SPARK-9821-pyspark-reduceByKey-should-take-a-custom-partitioner. 
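With this change the single-shuffle version of the example above can be written directly in Python. A short editor-added sketch of the new keyword arguments follows; it assumes an existing SparkContext `sc`, and the partitioner is a stand-in for the custom one in the description.

```python
from operator import add

# Any callable from key to int works; the result is taken modulo the number of
# partitions, just as with partitionBy().
def first_char_partitioner(key):
    return ord(key[0])

pairs = sc.parallelize([("html", 1), ("css", 1), ("html", 1), ("js", 1)])
counts = pairs.reduceByKey(add, numPartitions=3,
                           partitionFunc=first_char_partitioner)
print(sorted(counts.collect()))   # [('css', 1), ('html', 2), ('js', 1)]
```

The same partitionFunc argument is added to groupBy, groupByKey, combineByKey, aggregateByKey and foldByKey in the diff below.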
--- python/pyspark/rdd.py | 29 ++++++++++++++++------------- 1 file changed, 16 insertions(+), 13 deletions(-) diff --git a/python/pyspark/rdd.py b/python/pyspark/rdd.py index 73d7d9a5692a9..56e892243c79c 100644 --- a/python/pyspark/rdd.py +++ b/python/pyspark/rdd.py @@ -686,7 +686,7 @@ def cartesian(self, other): other._jrdd_deserializer) return RDD(self._jrdd.cartesian(other._jrdd), self.ctx, deserializer) - def groupBy(self, f, numPartitions=None): + def groupBy(self, f, numPartitions=None, partitionFunc=portable_hash): """ Return an RDD of grouped items. @@ -695,7 +695,7 @@ def groupBy(self, f, numPartitions=None): >>> sorted([(x, sorted(y)) for (x, y) in result]) [(0, [2, 8]), (1, [1, 1, 3, 5])] """ - return self.map(lambda x: (f(x), x)).groupByKey(numPartitions) + return self.map(lambda x: (f(x), x)).groupByKey(numPartitions, partitionFunc) @ignore_unicode_prefix def pipe(self, command, env=None, checkCode=False): @@ -1539,22 +1539,23 @@ def values(self): """ return self.map(lambda x: x[1]) - def reduceByKey(self, func, numPartitions=None): + def reduceByKey(self, func, numPartitions=None, partitionFunc=portable_hash): """ Merge the values for each key using an associative reduce function. This will also perform the merging locally on each mapper before sending results to a reducer, similarly to a "combiner" in MapReduce. - Output will be hash-partitioned with C{numPartitions} partitions, or + Output will be partitioned with C{numPartitions} partitions, or the default parallelism level if C{numPartitions} is not specified. + Default partitioner is hash-partition. >>> from operator import add >>> rdd = sc.parallelize([("a", 1), ("b", 1), ("a", 1)]) >>> sorted(rdd.reduceByKey(add).collect()) [('a', 2), ('b', 1)] """ - return self.combineByKey(lambda x: x, func, func, numPartitions) + return self.combineByKey(lambda x: x, func, func, numPartitions, partitionFunc) def reduceByKeyLocally(self, func): """ @@ -1739,7 +1740,7 @@ def add_shuffle_key(split, iterator): # TODO: add control over map-side aggregation def combineByKey(self, createCombiner, mergeValue, mergeCombiners, - numPartitions=None): + numPartitions=None, partitionFunc=portable_hash): """ Generic function to combine the elements for each key using a custom set of aggregation functions. @@ -1777,7 +1778,7 @@ def combineLocally(iterator): return merger.items() locally_combined = self.mapPartitions(combineLocally, preservesPartitioning=True) - shuffled = locally_combined.partitionBy(numPartitions) + shuffled = locally_combined.partitionBy(numPartitions, partitionFunc) def _mergeCombiners(iterator): merger = ExternalMerger(agg, memory, serializer) @@ -1786,7 +1787,8 @@ def _mergeCombiners(iterator): return shuffled.mapPartitions(_mergeCombiners, preservesPartitioning=True) - def aggregateByKey(self, zeroValue, seqFunc, combFunc, numPartitions=None): + def aggregateByKey(self, zeroValue, seqFunc, combFunc, numPartitions=None, + partitionFunc=portable_hash): """ Aggregate the values of each key, using given combine functions and a neutral "zero value". 
This function can return a different result type, U, than the type @@ -1800,9 +1802,9 @@ def createZero(): return copy.deepcopy(zeroValue) return self.combineByKey( - lambda v: seqFunc(createZero(), v), seqFunc, combFunc, numPartitions) + lambda v: seqFunc(createZero(), v), seqFunc, combFunc, numPartitions, partitionFunc) - def foldByKey(self, zeroValue, func, numPartitions=None): + def foldByKey(self, zeroValue, func, numPartitions=None, partitionFunc=portable_hash): """ Merge the values for each key using an associative function "func" and a neutral "zeroValue" which may be added to the result an @@ -1817,13 +1819,14 @@ def foldByKey(self, zeroValue, func, numPartitions=None): def createZero(): return copy.deepcopy(zeroValue) - return self.combineByKey(lambda v: func(createZero(), v), func, func, numPartitions) + return self.combineByKey(lambda v: func(createZero(), v), func, func, numPartitions, + partitionFunc) def _memory_limit(self): return _parse_memory(self.ctx._conf.get("spark.python.worker.memory", "512m")) # TODO: support variant with custom partitioner - def groupByKey(self, numPartitions=None): + def groupByKey(self, numPartitions=None, partitionFunc=portable_hash): """ Group the values for each key in the RDD into a single sequence. Hash-partitions the resulting RDD with numPartitions partitions. @@ -1859,7 +1862,7 @@ def combine(iterator): return merger.items() locally_combined = self.mapPartitions(combine, preservesPartitioning=True) - shuffled = locally_combined.partitionBy(numPartitions) + shuffled = locally_combined.partitionBy(numPartitions, partitionFunc) def groupByKey(it): merger = ExternalGroupBy(agg, memory, serializer) From bf20d6c9f9e478a5de24b45bbafd4dd89666c4cf Mon Sep 17 00:00:00 2001 From: Sean Owen Date: Mon, 21 Sep 2015 23:29:59 -0700 Subject: [PATCH 372/802] [SPARK-10716] [BUILD] spark-1.5.0-bin-hadoop2.6.tgz file doesn't uncompress on OS X due to hidden file Remove ._SUCCESS.crc hidden file that may cause problems in distribution tar archive, and is not used Author: Sean Owen Closes #8846 from srowen/SPARK-10716. --- .../test_support/sql/orc_partitioned/._SUCCESS.crc | Bin 8 -> 0 bytes 1 file changed, 0 insertions(+), 0 deletions(-) delete mode 100644 python/test_support/sql/orc_partitioned/._SUCCESS.crc diff --git a/python/test_support/sql/orc_partitioned/._SUCCESS.crc b/python/test_support/sql/orc_partitioned/._SUCCESS.crc deleted file mode 100644 index 3b7b044936a890cd8d651d349a752d819d71d22c..0000000000000000000000000000000000000000 GIT binary patch literal 0 HcmV?d00001 literal 8 PcmYc;N@ieSU}69O2$TUk From 0180b849dbaf191826231eda7dfaaf146a19602b Mon Sep 17 00:00:00 2001 From: Jian Feng Date: Mon, 21 Sep 2015 23:36:41 -0700 Subject: [PATCH 373/802] [SPARK-10577] [PYSPARK] DataFrame hint for broadcast join https://issues.apache.org/jira/browse/SPARK-10577 Author: Jian Feng Closes #8801 from Jianfeng-chs/master. 
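A short usage sketch of the new Python hint (editor-added; it assumes an existing SQLContext named `sqlContext` and mirrors the test included below):

```python
from pyspark.sql.functions import broadcast

large = sqlContext.createDataFrame([(1, "1"), (2, "2"), (3, "3")], ("key", "value"))
small = sqlContext.createDataFrame([(1, "one"), (2, "two")], ("key", "name"))

# Marking the smaller side asks the planner for a broadcast hash join instead
# of shuffling both sides.
joined = large.join(broadcast(small), "key")
joined.explain()   # the physical plan should mention BroadcastHashJoin
```

As the new test below shows, the hint only takes effect for joins with join keys; a join without keys does not become a broadcast join.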
--- python/pyspark/sql/functions.py | 9 +++++++++ python/pyspark/sql/tests.py | 18 ++++++++++++++++++ 2 files changed, 27 insertions(+) diff --git a/python/pyspark/sql/functions.py b/python/pyspark/sql/functions.py index 26b8662718a60..fa04f4cd83b6f 100644 --- a/python/pyspark/sql/functions.py +++ b/python/pyspark/sql/functions.py @@ -29,6 +29,7 @@ from pyspark.serializers import PickleSerializer, AutoBatchedSerializer from pyspark.sql.types import StringType from pyspark.sql.column import Column, _to_java_column, _to_seq +from pyspark.sql.dataframe import DataFrame def _create_function(name, doc=""): @@ -189,6 +190,14 @@ def approxCountDistinct(col, rsd=None): return Column(jc) +@since(1.6) +def broadcast(df): + """Marks a DataFrame as small enough for use in broadcast joins.""" + + sc = SparkContext._active_spark_context + return DataFrame(sc._jvm.functions.broadcast(df._jdf), df.sql_ctx) + + @since(1.4) def coalesce(*cols): """Returns the first column that is not null. diff --git a/python/pyspark/sql/tests.py b/python/pyspark/sql/tests.py index 3e680f1030a71..645133b2b2d84 100644 --- a/python/pyspark/sql/tests.py +++ b/python/pyspark/sql/tests.py @@ -1075,6 +1075,24 @@ def foo(): self.assertRaises(TypeError, foo) + # add test for SPARK-10577 (test broadcast join hint) + def test_functions_broadcast(self): + from pyspark.sql.functions import broadcast + + df1 = self.sqlCtx.createDataFrame([(1, "1"), (2, "2")], ("key", "value")) + df2 = self.sqlCtx.createDataFrame([(1, "1"), (2, "2")], ("key", "value")) + + # equijoin - should be converted into broadcast join + plan1 = df1.join(broadcast(df2), "key")._jdf.queryExecution().executedPlan() + self.assertEqual(1, plan1.toString().count("BroadcastHashJoin")) + + # no join key -- should not be a broadcast join + plan2 = df1.join(broadcast(df2))._jdf.queryExecution().executedPlan() + self.assertEqual(0, plan2.toString().count("BroadcastHashJoin")) + + # planner should not crash without a join + broadcast(df1)._jdf.queryExecution().executedPlan() + class HiveContextSQLTests(ReusedPySparkTestCase): From 781b21ba2a873ed29394c8dbc74fc700e3e0d17e Mon Sep 17 00:00:00 2001 From: Ewan Leith Date: Mon, 21 Sep 2015 23:43:20 -0700 Subject: [PATCH 374/802] [SPARK-10419] [SQL] Adding SQLServer support for datetimeoffset types to JdbcDialects Reading from Microsoft SQL Server over jdbc fails when the table contains datetimeoffset types. This patch registers a SQLServer JDBC Dialect that maps datetimeoffset to a String, as Microsoft suggest. Author: Ewan Leith Closes #8575 from realitymine-coordinator/sqlserver. --- .../apache/spark/sql/jdbc/JdbcDialects.scala | 18 ++++++++++++++++++ .../org/apache/spark/sql/jdbc/JDBCSuite.scala | 1 + 2 files changed, 19 insertions(+) diff --git a/sql/core/src/main/scala/org/apache/spark/sql/jdbc/JdbcDialects.scala b/sql/core/src/main/scala/org/apache/spark/sql/jdbc/JdbcDialects.scala index 68ebaaca6c53d..c70fea1c3f50e 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/jdbc/JdbcDialects.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/jdbc/JdbcDialects.scala @@ -137,6 +137,8 @@ object JdbcDialects { registerDialect(MySQLDialect) registerDialect(PostgresDialect) registerDialect(DB2Dialect) + registerDialect(MsSqlServerDialect) + /** * Fetch the JdbcDialect class corresponding to a given database url. @@ -260,3 +262,19 @@ case object DB2Dialect extends JdbcDialect { case _ => None } } + +/** + * :: DeveloperApi :: + * Default Microsoft SQL Server dialect, mapping the datetimeoffset types to a String on read. 
+ */ +@DeveloperApi +case object MsSqlServerDialect extends JdbcDialect { + override def canHandle(url: String): Boolean = url.startsWith("jdbc:sqlserver") + override def getCatalystType( + sqlType: Int, typeName: String, size: Int, md: MetadataBuilder): Option[DataType] = { + if (typeName.contains("datetimeoffset")) { + // String is recommend by Microsoft SQL Server for datetimeoffset types in non-MS clients + Some(StringType) + } else None + } +} diff --git a/sql/core/src/test/scala/org/apache/spark/sql/jdbc/JDBCSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/jdbc/JDBCSuite.scala index 5ab9381de4d66..c4b039a9c5359 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/jdbc/JDBCSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/jdbc/JDBCSuite.scala @@ -408,6 +408,7 @@ class JDBCSuite extends SparkFunSuite with BeforeAndAfter with SharedSQLContext assert(JdbcDialects.get("jdbc:mysql://127.0.0.1/db") == MySQLDialect) assert(JdbcDialects.get("jdbc:postgresql://127.0.0.1/db") == PostgresDialect) assert(JdbcDialects.get("jdbc:db2://127.0.0.1/db") == DB2Dialect) + assert(JdbcDialects.get("jdbc:sqlserver://127.0.0.1/db") == MsSqlServerDialect) assert(JdbcDialects.get("test.invalid") == NoopDialect) } From 1fcefef06950e2f03477282368ca835bbf40ff24 Mon Sep 17 00:00:00 2001 From: Liang-Chi Hsieh Date: Mon, 21 Sep 2015 23:46:00 -0700 Subject: [PATCH 375/802] [SPARK-10446][SQL] Support to specify join type when calling join with usingColumns JIRA: https://issues.apache.org/jira/browse/SPARK-10446 Currently the method `join(right: DataFrame, usingColumns: Seq[String])` only supports inner join. It is more convenient to have it support other join types. Author: Liang-Chi Hsieh Closes #8600 from viirya/usingcolumns_df. --- python/pyspark/sql/dataframe.py | 6 ++++- .../org/apache/spark/sql/DataFrame.scala | 22 ++++++++++++++++++- .../apache/spark/sql/DataFrameJoinSuite.scala | 13 +++++++++++ 3 files changed, 39 insertions(+), 2 deletions(-) diff --git a/python/pyspark/sql/dataframe.py b/python/pyspark/sql/dataframe.py index fb995fa3a76b5..80f8d8a0eb37d 100644 --- a/python/pyspark/sql/dataframe.py +++ b/python/pyspark/sql/dataframe.py @@ -567,7 +567,11 @@ def join(self, other, on=None, how=None): if on is None or len(on) == 0: jdf = self._jdf.join(other._jdf) elif isinstance(on[0], basestring): - jdf = self._jdf.join(other._jdf, self._jseq(on)) + if how is None: + jdf = self._jdf.join(other._jdf, self._jseq(on), "inner") + else: + assert isinstance(how, basestring), "how should be basestring" + jdf = self._jdf.join(other._jdf, self._jseq(on), how) else: assert isinstance(on[0], Column), "on should be Column or list of Column" if len(on) > 1: diff --git a/sql/core/src/main/scala/org/apache/spark/sql/DataFrame.scala b/sql/core/src/main/scala/org/apache/spark/sql/DataFrame.scala index 8f737c2023931..ba94d77b2e60e 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/DataFrame.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/DataFrame.scala @@ -484,6 +484,26 @@ class DataFrame private[sql]( * @since 1.4.0 */ def join(right: DataFrame, usingColumns: Seq[String]): DataFrame = { + join(right, usingColumns, "inner") + } + + /** + * Equi-join with another [[DataFrame]] using the given columns. + * + * Different from other join functions, the join columns will only appear once in the output, + * i.e. similar to SQL's `JOIN USING` syntax. 
+ * + * Note that if you perform a self-join using this function without aliasing the input + * [[DataFrame]]s, you will NOT be able to reference any columns after the join, since + * there is no way to disambiguate which side of the join you would like to reference. + * + * @param right Right side of the join operation. + * @param usingColumns Names of the columns to join on. This columns must exist on both sides. + * @param joinType One of: `inner`, `outer`, `left_outer`, `right_outer`, `leftsemi`. + * @group dfops + * @since 1.6.0 + */ + def join(right: DataFrame, usingColumns: Seq[String], joinType: String): DataFrame = { // Analyze the self join. The assumption is that the analyzer will disambiguate left vs right // by creating a new instance for one of the branch. val joined = sqlContext.executePlan( @@ -502,7 +522,7 @@ class DataFrame private[sql]( Join( joined.left, joined.right, - joinType = Inner, + joinType = JoinType(joinType), condition) ) } diff --git a/sql/core/src/test/scala/org/apache/spark/sql/DataFrameJoinSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/DataFrameJoinSuite.scala index e2716d7841d85..56ad71ea4f487 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/DataFrameJoinSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/DataFrameJoinSuite.scala @@ -42,6 +42,19 @@ class DataFrameJoinSuite extends QueryTest with SharedSQLContext { Row(1, 2, "1", "2") :: Row(2, 3, "2", "3") :: Row(3, 4, "3", "4") :: Nil) } + test("join - join using multiple columns and specifying join type") { + val df = Seq(1, 2, 3).map(i => (i, i + 1, i.toString)).toDF("int", "int2", "str") + val df2 = Seq(1, 2, 3).map(i => (i, i + 1, (i + 1).toString)).toDF("int", "int2", "str") + + checkAnswer( + df.join(df2, Seq("int", "str"), "left"), + Row(1, 2, "1", null) :: Row(2, 3, "2", null) :: Row(3, 4, "3", null) :: Nil) + + checkAnswer( + df.join(df2, Seq("int", "str"), "right"), + Row(null, null, null, 2) :: Row(null, null, null, 3) :: Row(null, null, null, 4) :: Nil) + } + test("join - join using self join") { val df = Seq(1, 2, 3).map(i => (i, i.toString)).toDF("int", "str") From f24316e6d928c263cbf3872edd97982059c3db22 Mon Sep 17 00:00:00 2001 From: Madhusudanan Kandasamy Date: Tue, 22 Sep 2015 00:03:48 -0700 Subject: [PATCH 376/802] [SPARK-10458] [SPARK CORE] Added isStopped() method in SparkContext Added isStopped() method in SparkContext Author: Madhusudanan Kandasamy Closes #8749 from kmadhugit/SPARK-10458. --- core/src/main/scala/org/apache/spark/SparkContext.scala | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/core/src/main/scala/org/apache/spark/SparkContext.scala b/core/src/main/scala/org/apache/spark/SparkContext.scala index ebd8e946ee7a2..967fec9f42bcf 100644 --- a/core/src/main/scala/org/apache/spark/SparkContext.scala +++ b/core/src/main/scala/org/apache/spark/SparkContext.scala @@ -265,6 +265,10 @@ class SparkContext(config: SparkConf) extends Logging with ExecutorAllocationCli val tachyonFolderName = externalBlockStoreFolderName def isLocal: Boolean = (master == "local" || master.startsWith("local[")) + /** + * @return true if context is stopped or in the midst of stopping. 
+ */ + def isStopped: Boolean = stopped.get() // An asynchronous listener bus for Spark events private[spark] val listenerBus = new LiveListenerBus From fd61b004877ba4d51c95cd0e08f53bffdf106395 Mon Sep 17 00:00:00 2001 From: Andrew Or Date: Tue, 22 Sep 2015 00:05:30 -0700 Subject: [PATCH 377/802] [Minor] style fix for previous commit f24316e --- core/src/main/scala/org/apache/spark/SparkContext.scala | 1 + 1 file changed, 1 insertion(+) diff --git a/core/src/main/scala/org/apache/spark/SparkContext.scala b/core/src/main/scala/org/apache/spark/SparkContext.scala index 967fec9f42bcf..bf3aeb488d597 100644 --- a/core/src/main/scala/org/apache/spark/SparkContext.scala +++ b/core/src/main/scala/org/apache/spark/SparkContext.scala @@ -265,6 +265,7 @@ class SparkContext(config: SparkConf) extends Logging with ExecutorAllocationCli val tachyonFolderName = externalBlockStoreFolderName def isLocal: Boolean = (master == "local" || master.startsWith("local[")) + /** * @return true if context is stopped or in the midst of stopping. */ From 4da32bc0e747fefe847bffe493785d4d16069c04 Mon Sep 17 00:00:00 2001 From: Yin Huai Date: Tue, 22 Sep 2015 00:07:30 -0700 Subject: [PATCH 378/802] [SPARK-8567] [SQL] Increase the timeout of o.a.s.sql.hive.HiveSparkSubmitSuite to 5 minutes. https://issues.apache.org/jira/browse/SPARK-8567 Looks like "SPARK-8368: includes jars passed in through --jars" is pretty flaky now. Based on some history runs, the time spent on a successful run may be from 1.5 minutes to almost 3 minutes. Let's try to increase the timeout and see if we can fix this test. https://amplab.cs.berkeley.edu/jenkins/job/Spark-1.5-SBT/AMPLAB_JENKINS_BUILD_PROFILE=hadoop2.0,label=spark-test/385/testReport/junit/org.apache.spark.sql.hive/HiveSparkSubmitSuite/SPARK_8368__includes_jars_passed_in_through___jars/history/?start=25 Author: Yin Huai Closes #8850 from yhuai/SPARK-8567-anotherTry. --- .../scala/org/apache/spark/sql/hive/HiveSparkSubmitSuite.scala | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sql/hive/src/test/scala/org/apache/spark/sql/hive/HiveSparkSubmitSuite.scala b/sql/hive/src/test/scala/org/apache/spark/sql/hive/HiveSparkSubmitSuite.scala index 97df249bdb6d6..5f1660b62d418 100644 --- a/sql/hive/src/test/scala/org/apache/spark/sql/hive/HiveSparkSubmitSuite.scala +++ b/sql/hive/src/test/scala/org/apache/spark/sql/hive/HiveSparkSubmitSuite.scala @@ -139,7 +139,7 @@ class HiveSparkSubmitSuite new ProcessOutputCapturer(process.getErrorStream, captureOutput("stderr")).start() try { - val exitCode = failAfter(180.seconds) { process.waitFor() } + val exitCode = failAfter(300.seconds) { process.waitFor() } if (exitCode != 0) { // include logs in output. Note that logging is async and may not have completed // at the time this exception is raised From f3b727c801408b1cd50e5d9463f2fe0fce654a16 Mon Sep 17 00:00:00 2001 From: Reynold Xin Date: Tue, 22 Sep 2015 00:09:29 -0700 Subject: [PATCH 379/802] [SQL] [MINOR] map -> foreach. DataFrame.explain should use foreach to print the explain content. Author: Reynold Xin Closes #8862 from rxin/map-foreach. 
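The same style point in an editor-added Python aside: map used only for its side effect either builds a throwaway collection (Python 2, and the Scala map being replaced here) or, in Python 3, does nothing until the lazy iterator is consumed. An explicit loop, the analogue of foreach, is the right tool for printing.

```python
from __future__ import print_function

rows = ["== Physical Plan ==", "Scan ..."]

map(print, rows)   # Python 2: prints, but also builds a list of Nones
                   # Python 3: lazy, prints nothing at all

for r in rows:     # foreach-style: eager, side effect only
    print(r)
```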
--- sql/core/src/main/scala/org/apache/spark/sql/DataFrame.scala | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/sql/core/src/main/scala/org/apache/spark/sql/DataFrame.scala b/sql/core/src/main/scala/org/apache/spark/sql/DataFrame.scala index ba94d77b2e60e..a11140b717360 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/DataFrame.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/DataFrame.scala @@ -320,9 +320,8 @@ class DataFrame private[sql]( * @since 1.3.0 */ def explain(extended: Boolean): Unit = { - ExplainCommand( - queryExecution.logical, - extended = extended).queryExecution.executedPlan.executeCollect().map { + val explain = ExplainCommand(queryExecution.logical, extended = extended) + explain.queryExecution.executedPlan.executeCollect().foreach { // scalastyle:off println r => println(r.getString(0)) // scalastyle:on println From 0bd0e5bed2176b119b3ada590993e153757ea09b Mon Sep 17 00:00:00 2001 From: Akash Mishra Date: Tue, 22 Sep 2015 00:14:27 -0700 Subject: [PATCH 380/802] =?UTF-8?q?[SPARK-10695]=20[DOCUMENTATION]=20[MESO?= =?UTF-8?q?S]=20Fixing=20incorrect=20value=20informati=E2=80=A6?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit …on for spark.mesos.constraints parameter. Author: Akash Mishra Closes #8816 from SleepyThread/constraint-fix. --- docs/running-on-mesos.md | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/docs/running-on-mesos.md b/docs/running-on-mesos.md index 460a66f37dd64..ec5a44d79212b 100644 --- a/docs/running-on-mesos.md +++ b/docs/running-on-mesos.md @@ -189,10 +189,10 @@ using `conf.set("spark.cores.max", "10")` (for example). You may also make use of `spark.mesos.constraints` to set attribute based constraints on mesos resource offers. By default, all resource offers will be accepted. {% highlight scala %} -conf.set("spark.mesos.constraints", "tachyon=true;us-east-1=false") +conf.set("spark.mesos.constraints", "tachyon:true;us-east-1:false") {% endhighlight %} -For example, Let's say `spark.mesos.constraints` is set to `tachyon=true;us-east-1=false`, then the resource offers will be checked to see if they meet both these constraints and only then will be accepted to start new executors. +For example, Let's say `spark.mesos.constraints` is set to `tachyon:true;us-east-1:false`, then the resource offers will be checked to see if they meet both these constraints and only then will be accepted to start new executors. # Mesos Docker Support From 7278f792a73bbcf8d68f38dc2d87cf722693c4cf Mon Sep 17 00:00:00 2001 From: Rekha Joshi Date: Tue, 22 Sep 2015 11:03:21 +0100 Subject: [PATCH 381/802] [SPARK-10718] [BUILD] Update License on conf files and corresponding excludes file update Update License on conf files and corresponding excludes file update Author: Rekha Joshi Author: Joshi Closes #8842 from rekhajoshm/SPARK-10718. 
--- .rat-excludes | 12 ------------ conf/docker.properties.template | 17 +++++++++++++++++ conf/fairscheduler.xml.template | 18 ++++++++++++++++++ conf/log4j.properties.template | 17 +++++++++++++++++ conf/metrics.properties.template | 17 +++++++++++++++++ conf/slaves.template | 17 +++++++++++++++++ conf/spark-defaults.conf.template | 17 +++++++++++++++++ conf/spark-env.sh.template | 17 +++++++++++++++++ .../spark/log4j-defaults-repl.properties | 17 +++++++++++++++++ .../org/apache/spark/log4j-defaults.properties | 17 +++++++++++++++++ 10 files changed, 154 insertions(+), 12 deletions(-) diff --git a/.rat-excludes b/.rat-excludes index 9165872b9fb27..08fba6d351d6a 100644 --- a/.rat-excludes +++ b/.rat-excludes @@ -15,20 +15,8 @@ TAGS RELEASE control docs -docker.properties.template -fairscheduler.xml.template -spark-defaults.conf.template -log4j.properties -log4j.properties.template -metrics.properties -metrics.properties.template slaves -slaves.template -spark-env.sh spark-env.cmd -spark-env.sh.template -log4j-defaults.properties -log4j-defaults-repl.properties bootstrap-tooltip.js jquery-1.11.1.min.js d3.min.js diff --git a/conf/docker.properties.template b/conf/docker.properties.template index 26e3bfd9c5b9b..55cb094b4af46 100644 --- a/conf/docker.properties.template +++ b/conf/docker.properties.template @@ -1,3 +1,20 @@ +# +# Licensed to the Apache Software Foundation (ASF) under one or more +# contributor license agreements. See the NOTICE file distributed with +# this work for additional information regarding copyright ownership. +# The ASF licenses this file to You under the Apache License, Version 2.0 +# (the "License"); you may not use this file except in compliance with +# the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# + spark.mesos.executor.docker.image: spark.mesos.executor.docker.volumes: /usr/local/lib:/host/usr/local/lib:ro spark.mesos.executor.home: /opt/spark diff --git a/conf/fairscheduler.xml.template b/conf/fairscheduler.xml.template index acf59e2a35986..385b2e772d2c8 100644 --- a/conf/fairscheduler.xml.template +++ b/conf/fairscheduler.xml.template @@ -1,4 +1,22 @@ + + + FAIR diff --git a/conf/log4j.properties.template b/conf/log4j.properties.template index 74c5cea94403a..f3046be54d7c6 100644 --- a/conf/log4j.properties.template +++ b/conf/log4j.properties.template @@ -1,3 +1,20 @@ +# +# Licensed to the Apache Software Foundation (ASF) under one or more +# contributor license agreements. See the NOTICE file distributed with +# this work for additional information regarding copyright ownership. +# The ASF licenses this file to You under the Apache License, Version 2.0 +# (the "License"); you may not use this file except in compliance with +# the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+# + # Set everything to be logged to the console log4j.rootCategory=INFO, console log4j.appender.console=org.apache.log4j.ConsoleAppender diff --git a/conf/metrics.properties.template b/conf/metrics.properties.template index 7f17bc7eea4f5..d6962e0da2f30 100644 --- a/conf/metrics.properties.template +++ b/conf/metrics.properties.template @@ -1,3 +1,20 @@ +# +# Licensed to the Apache Software Foundation (ASF) under one or more +# contributor license agreements. See the NOTICE file distributed with +# this work for additional information regarding copyright ownership. +# The ASF licenses this file to You under the Apache License, Version 2.0 +# (the "License"); you may not use this file except in compliance with +# the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# + # syntax: [instance].sink|source.[name].[options]=[value] # This file configures Spark's internal metrics system. The metrics system is diff --git a/conf/slaves.template b/conf/slaves.template index da0a01343d20a..be42a638230b7 100644 --- a/conf/slaves.template +++ b/conf/slaves.template @@ -1,2 +1,19 @@ +# +# Licensed to the Apache Software Foundation (ASF) under one or more +# contributor license agreements. See the NOTICE file distributed with +# this work for additional information regarding copyright ownership. +# The ASF licenses this file to You under the Apache License, Version 2.0 +# (the "License"); you may not use this file except in compliance with +# the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# + # A Spark Worker will be started on each of the machines listed below. localhost \ No newline at end of file diff --git a/conf/spark-defaults.conf.template b/conf/spark-defaults.conf.template index a48dcc70e1363..19cba6e71ed19 100644 --- a/conf/spark-defaults.conf.template +++ b/conf/spark-defaults.conf.template @@ -1,3 +1,20 @@ +# +# Licensed to the Apache Software Foundation (ASF) under one or more +# contributor license agreements. See the NOTICE file distributed with +# this work for additional information regarding copyright ownership. +# The ASF licenses this file to You under the Apache License, Version 2.0 +# (the "License"); you may not use this file except in compliance with +# the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# + # Default system properties included when running spark-submit. # This is useful for setting default environmental settings. 
diff --git a/conf/spark-env.sh.template b/conf/spark-env.sh.template index c05fe381a36a7..990ded420be72 100755 --- a/conf/spark-env.sh.template +++ b/conf/spark-env.sh.template @@ -1,5 +1,22 @@ #!/usr/bin/env bash +# +# Licensed to the Apache Software Foundation (ASF) under one or more +# contributor license agreements. See the NOTICE file distributed with +# this work for additional information regarding copyright ownership. +# The ASF licenses this file to You under the Apache License, Version 2.0 +# (the "License"); you may not use this file except in compliance with +# the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# + # This file is sourced when running various Spark programs. # Copy it as spark-env.sh and edit that to configure Spark for your site. diff --git a/core/src/main/resources/org/apache/spark/log4j-defaults-repl.properties b/core/src/main/resources/org/apache/spark/log4j-defaults-repl.properties index 689afea64f8db..c85abc35b93bf 100644 --- a/core/src/main/resources/org/apache/spark/log4j-defaults-repl.properties +++ b/core/src/main/resources/org/apache/spark/log4j-defaults-repl.properties @@ -1,3 +1,20 @@ +# +# Licensed to the Apache Software Foundation (ASF) under one or more +# contributor license agreements. See the NOTICE file distributed with +# this work for additional information regarding copyright ownership. +# The ASF licenses this file to You under the Apache License, Version 2.0 +# (the "License"); you may not use this file except in compliance with +# the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# + # Set everything to be logged to the console log4j.rootCategory=WARN, console log4j.appender.console=org.apache.log4j.ConsoleAppender diff --git a/core/src/main/resources/org/apache/spark/log4j-defaults.properties b/core/src/main/resources/org/apache/spark/log4j-defaults.properties index 27006e45e932b..d44cc85dcbd82 100644 --- a/core/src/main/resources/org/apache/spark/log4j-defaults.properties +++ b/core/src/main/resources/org/apache/spark/log4j-defaults.properties @@ -1,3 +1,20 @@ +# +# Licensed to the Apache Software Foundation (ASF) under one or more +# contributor license agreements. See the NOTICE file distributed with +# this work for additional information regarding copyright ownership. +# The ASF licenses this file to You under the Apache License, Version 2.0 +# (the "License"); you may not use this file except in compliance with +# the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+# See the License for the specific language governing permissions and +# limitations under the License. +# + # Set everything to be logged to the console log4j.rootCategory=INFO, console log4j.appender.console=org.apache.log4j.ConsoleAppender From 870b8a2edd44c9274c43ca0db4ef5b0998e16fd8 Mon Sep 17 00:00:00 2001 From: Meihua Wu Date: Tue, 22 Sep 2015 11:05:24 +0100 Subject: [PATCH 382/802] [SPARK-10706] [MLLIB] Add java wrapper for random vector rdd Add java wrapper for random vector rdd holdenk srowen Author: Meihua Wu Closes #8841 from rotationsymmetry/SPARK-10706. --- .../spark/mllib/random/RandomRDDs.scala | 42 +++++++++++++++++++ .../mllib/random/JavaRandomRDDsSuite.java | 17 ++++++++ 2 files changed, 59 insertions(+) diff --git a/mllib/src/main/scala/org/apache/spark/mllib/random/RandomRDDs.scala b/mllib/src/main/scala/org/apache/spark/mllib/random/RandomRDDs.scala index f8ff26b5795be..41d7c4d355f61 100644 --- a/mllib/src/main/scala/org/apache/spark/mllib/random/RandomRDDs.scala +++ b/mllib/src/main/scala/org/apache/spark/mllib/random/RandomRDDs.scala @@ -855,6 +855,48 @@ object RandomRDDs { sc, numRows, numCols, numPartitionsOrDefault(sc, numPartitions), generator, seed) } + /** + * Java-friendly version of [[RandomRDDs#randomVectorRDD]]. + */ + @DeveloperApi + @Since("1.6.0") + def randomJavaVectorRDD( + jsc: JavaSparkContext, + generator: RandomDataGenerator[Double], + numRows: Long, + numCols: Int, + numPartitions: Int, + seed: Long): JavaRDD[Vector] = { + randomVectorRDD(jsc.sc, generator, numRows, numCols, numPartitions, seed).toJavaRDD() + } + + /** + * [[RandomRDDs#randomJavaVectorRDD]] with the default seed. + */ + @DeveloperApi + @Since("1.6.0") + def randomJavaVectorRDD( + jsc: JavaSparkContext, + generator: RandomDataGenerator[Double], + numRows: Long, + numCols: Int, + numPartitions: Int): JavaRDD[Vector] = { + randomVectorRDD(jsc.sc, generator, numRows, numCols, numPartitions).toJavaRDD() + } + + /** + * [[RandomRDDs#randomJavaVectorRDD]] with the default number of partitions and the default seed. + */ + @DeveloperApi + @Since("1.6.0") + def randomJavaVectorRDD( + jsc: JavaSparkContext, + generator: RandomDataGenerator[Double], + numRows: Long, + numCols: Int): JavaRDD[Vector] = { + randomVectorRDD(jsc.sc, generator, numRows, numCols).toJavaRDD() + } + /** * Returns `numPartitions` if it is positive, or `sc.defaultParallelism` otherwise. 
*/ diff --git a/mllib/src/test/java/org/apache/spark/mllib/random/JavaRandomRDDsSuite.java b/mllib/src/test/java/org/apache/spark/mllib/random/JavaRandomRDDsSuite.java index fce5f6712f462..5728df5aeebdc 100644 --- a/mllib/src/test/java/org/apache/spark/mllib/random/JavaRandomRDDsSuite.java +++ b/mllib/src/test/java/org/apache/spark/mllib/random/JavaRandomRDDsSuite.java @@ -246,6 +246,23 @@ public void testArbitrary() { Assert.assertEquals(2, rdd.first().length()); } } + + @Test + @SuppressWarnings("unchecked") + public void testRandomVectorRDD() { + UniformGenerator generator = new UniformGenerator(); + long m = 100L; + int n = 10; + int p = 2; + long seed = 1L; + JavaRDD rdd1 = randomJavaVectorRDD(sc, generator, m, n); + JavaRDD rdd2 = randomJavaVectorRDD(sc, generator, m, n, p); + JavaRDD rdd3 = randomJavaVectorRDD(sc, generator, m, n, p, seed); + for (JavaRDD rdd: Arrays.asList(rdd1, rdd2, rdd3)) { + Assert.assertEquals(m, rdd.count()); + Assert.assertEquals(n, rdd.first().size()); + } + } } // This is just a test generator, it always returns a string of 42 From f4a3c4e34ce93bcaf29c0a35573932880a8b792b Mon Sep 17 00:00:00 2001 From: Holden Karau Date: Tue, 22 Sep 2015 10:19:08 -0700 Subject: [PATCH 383/802] [SPARK-9962] [ML] Decision Tree training: prevNodeIdsForInstances.unpersist() at end of training NodeIdCache: prevNodeIdsForInstances.unpersist() needs to be called at end of training. Author: Holden Karau Closes #8541 from holdenk/SPARK-9962-decission-tree-training-prevNodeIdsForiNstances-unpersist-at-end-of-training. --- .../scala/org/apache/spark/ml/tree/impl/NodeIdCache.scala | 8 ++++---- .../org/apache/spark/mllib/tree/impl/NodeIdCache.scala | 4 ++++ 2 files changed, 8 insertions(+), 4 deletions(-) diff --git a/mllib/src/main/scala/org/apache/spark/ml/tree/impl/NodeIdCache.scala b/mllib/src/main/scala/org/apache/spark/ml/tree/impl/NodeIdCache.scala index 488e8e4fb5dcd..c5ad8df73fac9 100644 --- a/mllib/src/main/scala/org/apache/spark/ml/tree/impl/NodeIdCache.scala +++ b/mllib/src/main/scala/org/apache/spark/ml/tree/impl/NodeIdCache.scala @@ -164,10 +164,10 @@ private[spark] class NodeIdCache( } } } - } - if (prevNodeIdsForInstances != null) { - // Unpersist the previous one if one exists. - prevNodeIdsForInstances.unpersist() + if (prevNodeIdsForInstances != null) { + // Unpersist the previous one if one exists. + prevNodeIdsForInstances.unpersist() + } } } diff --git a/mllib/src/main/scala/org/apache/spark/mllib/tree/impl/NodeIdCache.scala b/mllib/src/main/scala/org/apache/spark/mllib/tree/impl/NodeIdCache.scala index 8f9eb24b57b55..0abed5411143d 100644 --- a/mllib/src/main/scala/org/apache/spark/mllib/tree/impl/NodeIdCache.scala +++ b/mllib/src/main/scala/org/apache/spark/mllib/tree/impl/NodeIdCache.scala @@ -166,6 +166,10 @@ private[spark] class NodeIdCache( fs.delete(new Path(old.getCheckpointFile.get), true) } } + if (prevNodeIdsForInstances != null) { + // Unpersist the previous one if one exists. + prevNodeIdsForInstances.unpersist() + } } } From 7104ee0e5dc1290b8b845a0a8ddcdb1875cfd060 Mon Sep 17 00:00:00 2001 From: Yanbo Liang Date: Tue, 22 Sep 2015 11:00:33 -0700 Subject: [PATCH 384/802] [SPARK-10750] [ML] ML Param validate should print better error information Currently when you set illegal value for params of array type (such as IntArrayParam, DoubleArrayParam, StringArrayParam), it will throw IllegalArgumentException but with incomprehensible error information. 
Take ```VectorSlicer.setNames``` as an example: ```scala val vectorSlicer = new VectorSlicer().setInputCol("features").setOutputCol("result") // The value of setNames must be contain distinct elements, so the next line will throw exception. vectorSlicer.setIndices(Array.empty).setNames(Array("f1", "f4", "f1")) ``` It will throw IllegalArgumentException as: ``` vectorSlicer_b3b4d1a10f43 parameter names given invalid value [Ljava.lang.String;798256c5. java.lang.IllegalArgumentException: vectorSlicer_b3b4d1a10f43 parameter names given invalid value [Ljava.lang.String;798256c5. ``` We should distinguish the value of array type from primitive type at Param.validate(value: T), and we will get better error information. ``` vectorSlicer_3b744ea277b2 parameter names given invalid value [f1,f4,f1]. java.lang.IllegalArgumentException: vectorSlicer_3b744ea277b2 parameter names given invalid value [f1,f4,f1]. ``` Author: Yanbo Liang Closes #8863 from yanboliang/spark-10750. --- .../src/main/scala/org/apache/spark/ml/param/params.scala | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/mllib/src/main/scala/org/apache/spark/ml/param/params.scala b/mllib/src/main/scala/org/apache/spark/ml/param/params.scala index de32b7218c277..48f6269e57e98 100644 --- a/mllib/src/main/scala/org/apache/spark/ml/param/params.scala +++ b/mllib/src/main/scala/org/apache/spark/ml/param/params.scala @@ -65,7 +65,12 @@ class Param[T](val parent: String, val name: String, val doc: String, val isVali */ private[param] def validate(value: T): Unit = { if (!isValid(value)) { - throw new IllegalArgumentException(s"$parent parameter $name given invalid value $value.") + val valueToString = value match { + case v: Array[_] => v.mkString("[", ",", "]") + case _ => value.toString + } + throw new IllegalArgumentException( + s"$parent parameter $name given invalid value $valueToString.") } } From 2ea0f2e11b82ef4817c7e6a162ea23da7860b893 Mon Sep 17 00:00:00 2001 From: xutingjun Date: Tue, 22 Sep 2015 11:01:32 -0700 Subject: [PATCH 385/802] [SPARK-9585] Delete the input format caching because some input format are non thread safe If we cache the InputFormat, all tasks on the same executor will share it. Some InputFormat is thread safety, but some are not, such as HiveHBaseTableInputFormat. If tasks share a non thread safe InputFormat, unexpected error may be occurs. To avoid it, I think we should delete the input format caching. Author: xutingjun Author: meiyoula <1039320815@qq.com> Author: Xutingjun Closes #7918 from XuTingjun/cached_inputFormat. --- core/src/main/scala/org/apache/spark/rdd/HadoopRDD.scala | 6 ------ 1 file changed, 6 deletions(-) diff --git a/core/src/main/scala/org/apache/spark/rdd/HadoopRDD.scala b/core/src/main/scala/org/apache/spark/rdd/HadoopRDD.scala index 8f2655d63b797..77b57132b9f1f 100644 --- a/core/src/main/scala/org/apache/spark/rdd/HadoopRDD.scala +++ b/core/src/main/scala/org/apache/spark/rdd/HadoopRDD.scala @@ -182,17 +182,11 @@ class HadoopRDD[K, V]( } protected def getInputFormat(conf: JobConf): InputFormat[K, V] = { - if (HadoopRDD.containsCachedMetadata(inputFormatCacheKey)) { - return HadoopRDD.getCachedMetadata(inputFormatCacheKey).asInstanceOf[InputFormat[K, V]] - } - // Once an InputFormat for this RDD is created, cache it so that only one reflection call is - // done in each local process. 
val newInputFormat = ReflectionUtils.newInstance(inputFormatClass.asInstanceOf[Class[_]], conf) .asInstanceOf[InputFormat[K, V]] if (newInputFormat.isInstanceOf[Configurable]) { newInputFormat.asInstanceOf[Configurable].setConf(conf) } - HadoopRDD.putCachedMetadata(inputFormatCacheKey, newInputFormat) newInputFormat } From 22d40159e60dd27a428e4051ef607292cbffbff3 Mon Sep 17 00:00:00 2001 From: Davies Liu Date: Tue, 22 Sep 2015 11:07:01 -0700 Subject: [PATCH 386/802] [SPARK-10593] [SQL] fix resolve output of Generate The output of Generate should not be resolved as Reference. Author: Davies Liu Closes #8755 from davies/view. --- .../spark/sql/catalyst/analysis/Analyzer.scala | 16 ++++++++++++++++ .../spark/sql/catalyst/plans/QueryPlan.scala | 1 - .../catalyst/plans/logical/basicOperators.scala | 2 +- .../spark/sql/hive/execution/SQLQuerySuite.scala | 14 ++++++++++++++ 4 files changed, 31 insertions(+), 2 deletions(-) diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/Analyzer.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/Analyzer.scala index 02f34cbf58ad0..bf72d47ce1ea6 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/Analyzer.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/Analyzer.scala @@ -378,6 +378,22 @@ class Analyzer( val newOrdering = resolveSortOrders(ordering, child, throws = false) Sort(newOrdering, global, child) + // A special case for Generate, because the output of Generate should not be resolved by + // ResolveReferences. Attributes in the output will be resolved by ResolveGenerate. + case g @ Generate(generator, join, outer, qualifier, output, child) + if child.resolved && !generator.resolved => + val newG = generator transformUp { + case u @ UnresolvedAttribute(nameParts) => + withPosition(u) { child.resolve(nameParts, resolver).getOrElse(u) } + case UnresolvedExtractValue(child, fieldExpr) => + ExtractValue(child, fieldExpr, resolver) + } + if (newG.fastEquals(generator)) { + g + } else { + Generate(newG.asInstanceOf[Generator], join, outer, qualifier, output, child) + } + case q: LogicalPlan => logTrace(s"Attempting to resolve ${q.simpleString}") q transformExpressionsUp { diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/plans/QueryPlan.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/plans/QueryPlan.scala index 55286f9f2fc5c..0ec9f08571082 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/plans/QueryPlan.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/plans/QueryPlan.scala @@ -18,7 +18,6 @@ package org.apache.spark.sql.catalyst.plans import org.apache.spark.sql.catalyst.expressions.{Attribute, AttributeSet, Expression, VirtualColumn} -import org.apache.spark.sql.catalyst.plans.logical.LogicalPlan import org.apache.spark.sql.catalyst.trees.TreeNode import org.apache.spark.sql.types.{DataType, StructType} diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/plans/logical/basicOperators.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/plans/logical/basicOperators.scala index 722f69cdca827..ae9482c10f126 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/plans/logical/basicOperators.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/plans/logical/basicOperators.scala @@ -68,7 +68,7 @@ case class Generate( generator.resolved && childrenResolved && generator.elementTypes.length == generatorOutput.length 
&& - !generatorOutput.exists(!_.resolved) + generatorOutput.forall(_.resolved) } // we don't want the gOutput to be taken as part of the expressions diff --git a/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/SQLQuerySuite.scala b/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/SQLQuerySuite.scala index 8126d02335217..bb02473dd17ca 100644 --- a/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/SQLQuerySuite.scala +++ b/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/SQLQuerySuite.scala @@ -1170,4 +1170,18 @@ class SQLQuerySuite extends QueryTest with SQLTestUtils with TestHiveSingleton { checkAnswer(sqlContext.table("`db.t`"), df) } } + + test("SPARK-10593 same column names in lateral view") { + val df = sqlContext.sql( + """ + |select + |insideLayer2.json as a2 + |from (select '{"layer1": {"layer2": "text inside layer 2"}}' json) test + |lateral view json_tuple(json, 'layer1') insideLayer1 as json + |lateral view json_tuple(insideLayer1.json, 'layer2') insideLayer2 as json + """.stripMargin + ) + + checkAnswer(df, Row("text inside layer 2") :: Nil) + } } From 1ca5e2e0b8d8d406c02a74c76ae9d7fc5637c8d3 Mon Sep 17 00:00:00 2001 From: Josh Rosen Date: Tue, 22 Sep 2015 11:50:22 -0700 Subject: [PATCH 387/802] [SPARK-10704] Rename HashShuffleReader to BlockStoreShuffleReader The current shuffle code has an interface named ShuffleReader with only one implementation, HashShuffleReader. This naming is confusing, since the same read path code is used for both sort- and hash-based shuffle. This patch addresses this by renaming HashShuffleReader to BlockStoreShuffleReader. Author: Josh Rosen Closes #8825 from JoshRosen/shuffle-reader-cleanup. --- ...shShuffleReader.scala => BlockStoreShuffleReader.scala} | 5 ++--- .../org/apache/spark/shuffle/hash/HashShuffleManager.scala | 2 +- .../org/apache/spark/shuffle/sort/SortShuffleManager.scala | 3 +-- ...eaderSuite.scala => BlockStoreShuffleReaderSuite.scala} | 7 +++---- 4 files changed, 7 insertions(+), 10 deletions(-) rename core/src/main/scala/org/apache/spark/shuffle/{hash/HashShuffleReader.scala => BlockStoreShuffleReader.scala} (97%) rename core/src/test/scala/org/apache/spark/shuffle/{hash/HashShuffleReaderSuite.scala => BlockStoreShuffleReaderSuite.scala} (96%) diff --git a/core/src/main/scala/org/apache/spark/shuffle/hash/HashShuffleReader.scala b/core/src/main/scala/org/apache/spark/shuffle/BlockStoreShuffleReader.scala similarity index 97% rename from core/src/main/scala/org/apache/spark/shuffle/hash/HashShuffleReader.scala rename to core/src/main/scala/org/apache/spark/shuffle/BlockStoreShuffleReader.scala index 0c8f08f0f3b1b..6dc9a16e58531 100644 --- a/core/src/main/scala/org/apache/spark/shuffle/hash/HashShuffleReader.scala +++ b/core/src/main/scala/org/apache/spark/shuffle/BlockStoreShuffleReader.scala @@ -15,16 +15,15 @@ * limitations under the License. 
*/ -package org.apache.spark.shuffle.hash +package org.apache.spark.shuffle import org.apache.spark._ import org.apache.spark.serializer.Serializer -import org.apache.spark.shuffle.{BaseShuffleHandle, ShuffleReader} import org.apache.spark.storage.{BlockManager, ShuffleBlockFetcherIterator} import org.apache.spark.util.CompletionIterator import org.apache.spark.util.collection.ExternalSorter -private[spark] class HashShuffleReader[K, C]( +private[spark] class BlockStoreShuffleReader[K, C]( handle: BaseShuffleHandle[K, _, C], startPartition: Int, endPartition: Int, diff --git a/core/src/main/scala/org/apache/spark/shuffle/hash/HashShuffleManager.scala b/core/src/main/scala/org/apache/spark/shuffle/hash/HashShuffleManager.scala index 0b46634b8b466..d2e2fc4c110a7 100644 --- a/core/src/main/scala/org/apache/spark/shuffle/hash/HashShuffleManager.scala +++ b/core/src/main/scala/org/apache/spark/shuffle/hash/HashShuffleManager.scala @@ -51,7 +51,7 @@ private[spark] class HashShuffleManager(conf: SparkConf) extends ShuffleManager startPartition: Int, endPartition: Int, context: TaskContext): ShuffleReader[K, C] = { - new HashShuffleReader( + new BlockStoreShuffleReader( handle.asInstanceOf[BaseShuffleHandle[K, _, C]], startPartition, endPartition, context) } diff --git a/core/src/main/scala/org/apache/spark/shuffle/sort/SortShuffleManager.scala b/core/src/main/scala/org/apache/spark/shuffle/sort/SortShuffleManager.scala index 476cc1f303da7..9df4e551669cc 100644 --- a/core/src/main/scala/org/apache/spark/shuffle/sort/SortShuffleManager.scala +++ b/core/src/main/scala/org/apache/spark/shuffle/sort/SortShuffleManager.scala @@ -21,7 +21,6 @@ import java.util.concurrent.ConcurrentHashMap import org.apache.spark.{Logging, SparkConf, TaskContext, ShuffleDependency} import org.apache.spark.shuffle._ -import org.apache.spark.shuffle.hash.HashShuffleReader private[spark] class SortShuffleManager(conf: SparkConf) extends ShuffleManager with Logging { @@ -54,7 +53,7 @@ private[spark] class SortShuffleManager(conf: SparkConf) extends ShuffleManager endPartition: Int, context: TaskContext): ShuffleReader[K, C] = { // We currently use the same block store shuffle fetcher as the hash-based shuffle. - new HashShuffleReader( + new BlockStoreShuffleReader( handle.asInstanceOf[BaseShuffleHandle[K, _, C]], startPartition, endPartition, context) } diff --git a/core/src/test/scala/org/apache/spark/shuffle/hash/HashShuffleReaderSuite.scala b/core/src/test/scala/org/apache/spark/shuffle/BlockStoreShuffleReaderSuite.scala similarity index 96% rename from core/src/test/scala/org/apache/spark/shuffle/hash/HashShuffleReaderSuite.scala rename to core/src/test/scala/org/apache/spark/shuffle/BlockStoreShuffleReaderSuite.scala index 05b3afef5b839..a5eafb1b5529e 100644 --- a/core/src/test/scala/org/apache/spark/shuffle/hash/HashShuffleReaderSuite.scala +++ b/core/src/test/scala/org/apache/spark/shuffle/BlockStoreShuffleReaderSuite.scala @@ -15,7 +15,7 @@ * limitations under the License. 
*/ -package org.apache.spark.shuffle.hash +package org.apache.spark.shuffle import java.io.{ByteArrayOutputStream, InputStream} import java.nio.ByteBuffer @@ -28,7 +28,6 @@ import org.mockito.stubbing.Answer import org.apache.spark._ import org.apache.spark.network.buffer.{ManagedBuffer, NioManagedBuffer} import org.apache.spark.serializer.JavaSerializer -import org.apache.spark.shuffle.BaseShuffleHandle import org.apache.spark.storage.{BlockManager, BlockManagerId, ShuffleBlockId} /** @@ -56,7 +55,7 @@ class RecordingManagedBuffer(underlyingBuffer: NioManagedBuffer) extends Managed } } -class HashShuffleReaderSuite extends SparkFunSuite with LocalSparkContext { +class BlockStoreShuffleReaderSuite extends SparkFunSuite with LocalSparkContext { /** * This test makes sure that, when data is read from a HashShuffleReader, the underlying @@ -134,7 +133,7 @@ class HashShuffleReaderSuite extends SparkFunSuite with LocalSparkContext { new BaseShuffleHandle(shuffleId, numMaps, dependency) } - val shuffleReader = new HashShuffleReader( + val shuffleReader = new BlockStoreShuffleReader( shuffleHandle, reduceId, reduceId + 1, From 5017c685f484ec256101d1d33bad11d9e0c0f641 Mon Sep 17 00:00:00 2001 From: Wenchen Fan Date: Tue, 22 Sep 2015 12:14:15 -0700 Subject: [PATCH 388/802] [SPARK-10740] [SQL] handle nondeterministic expressions correctly for set operations https://issues.apache.org/jira/browse/SPARK-10740 Author: Wenchen Fan Closes #8858 from cloud-fan/non-deter. --- .../sql/catalyst/optimizer/Optimizer.scala | 69 ++++++++++++++----- .../optimizer/SetOperationPushDownSuite.scala | 3 +- .../org/apache/spark/sql/DataFrameSuite.scala | 41 +++++++++++ 3 files changed, 93 insertions(+), 20 deletions(-) diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/optimizer/Optimizer.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/optimizer/Optimizer.scala index 324f40a051c38..63602eaa8ccd8 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/optimizer/Optimizer.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/optimizer/Optimizer.scala @@ -95,14 +95,14 @@ object SamplePushDown extends Rule[LogicalPlan] { * Intersect: * It is not safe to pushdown Projections through it because we need to get the * intersect of rows by comparing the entire rows. It is fine to pushdown Filters - * because we will not have non-deterministic expressions. + * with deterministic condition. * * Except: * It is not safe to pushdown Projections through it because we need to get the * intersect of rows by comparing the entire rows. It is fine to pushdown Filters - * because we will not have non-deterministic expressions. + * with deterministic condition. */ -object SetOperationPushDown extends Rule[LogicalPlan] { +object SetOperationPushDown extends Rule[LogicalPlan] with PredicateHelper { /** * Maps Attributes from the left side to the corresponding Attribute on the right side. @@ -129,34 +129,65 @@ object SetOperationPushDown extends Rule[LogicalPlan] { result.asInstanceOf[A] } + /** + * Splits the condition expression into small conditions by `And`, and partition them by + * deterministic, and finally recombine them by `And`. It returns an expression containing + * all deterministic expressions (the first field of the returned Tuple2) and an expression + * containing all non-deterministic expressions (the second field of the returned Tuple2). 
+ */ + private def partitionByDeterministic(condition: Expression): (Expression, Expression) = { + val andConditions = splitConjunctivePredicates(condition) + andConditions.partition(_.deterministic) match { + case (deterministic, nondeterministic) => + deterministic.reduceOption(And).getOrElse(Literal(true)) -> + nondeterministic.reduceOption(And).getOrElse(Literal(true)) + } + } + def apply(plan: LogicalPlan): LogicalPlan = plan transform { // Push down filter into union case Filter(condition, u @ Union(left, right)) => + val (deterministic, nondeterministic) = partitionByDeterministic(condition) val rewrites = buildRewrites(u) - Union( - Filter(condition, left), - Filter(pushToRight(condition, rewrites), right)) - - // Push down projection through UNION ALL - case Project(projectList, u @ Union(left, right)) => - val rewrites = buildRewrites(u) - Union( - Project(projectList, left), - Project(projectList.map(pushToRight(_, rewrites)), right)) + Filter(nondeterministic, + Union( + Filter(deterministic, left), + Filter(pushToRight(deterministic, rewrites), right) + ) + ) + + // Push down deterministic projection through UNION ALL + case p @ Project(projectList, u @ Union(left, right)) => + if (projectList.forall(_.deterministic)) { + val rewrites = buildRewrites(u) + Union( + Project(projectList, left), + Project(projectList.map(pushToRight(_, rewrites)), right)) + } else { + p + } // Push down filter through INTERSECT case Filter(condition, i @ Intersect(left, right)) => + val (deterministic, nondeterministic) = partitionByDeterministic(condition) val rewrites = buildRewrites(i) - Intersect( - Filter(condition, left), - Filter(pushToRight(condition, rewrites), right)) + Filter(nondeterministic, + Intersect( + Filter(deterministic, left), + Filter(pushToRight(deterministic, rewrites), right) + ) + ) // Push down filter through EXCEPT case Filter(condition, e @ Except(left, right)) => + val (deterministic, nondeterministic) = partitionByDeterministic(condition) val rewrites = buildRewrites(e) - Except( - Filter(condition, left), - Filter(pushToRight(condition, rewrites), right)) + Filter(nondeterministic, + Except( + Filter(deterministic, left), + Filter(pushToRight(deterministic, rewrites), right) + ) + ) } } diff --git a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/optimizer/SetOperationPushDownSuite.scala b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/optimizer/SetOperationPushDownSuite.scala index 3fca47a023dc6..1595ad9327423 100644 --- a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/optimizer/SetOperationPushDownSuite.scala +++ b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/optimizer/SetOperationPushDownSuite.scala @@ -30,7 +30,8 @@ class SetOperationPushDownSuite extends PlanTest { Batch("Subqueries", Once, EliminateSubQueries) :: Batch("Union Pushdown", Once, - SetOperationPushDown) :: Nil + SetOperationPushDown, + SimplifyFilters) :: Nil } val testRelation = LocalRelation('a.int, 'b.int, 'c.int) diff --git a/sql/core/src/test/scala/org/apache/spark/sql/DataFrameSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/DataFrameSuite.scala index 1370713975f2f..d919877746c72 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/DataFrameSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/DataFrameSuite.scala @@ -916,4 +916,45 @@ class DataFrameSuite extends QueryTest with SharedSQLContext { assert(intersect.count() === 30) assert(except.count() === 70) } + + test("SPARK-10740: handle nondeterministic expressions 
correctly for set operations") { + val df1 = (1 to 20).map(Tuple1.apply).toDF("i") + val df2 = (1 to 10).map(Tuple1.apply).toDF("i") + + // When generating expected results at here, we need to follow the implementation of + // Rand expression. + def expected(df: DataFrame): Seq[Row] = { + df.rdd.collectPartitions().zipWithIndex.flatMap { + case (data, index) => + val rng = new org.apache.spark.util.random.XORShiftRandom(7 + index) + data.filter(_.getInt(0) < rng.nextDouble() * 10) + } + } + + val union = df1.unionAll(df2) + checkAnswer( + union.filter('i < rand(7) * 10), + expected(union) + ) + checkAnswer( + union.select(rand(7)), + union.rdd.collectPartitions().zipWithIndex.flatMap { + case (data, index) => + val rng = new org.apache.spark.util.random.XORShiftRandom(7 + index) + data.map(_ => rng.nextDouble()).map(i => Row(i)) + } + ) + + val intersect = df1.intersect(df2) + checkAnswer( + intersect.filter('i < rand(7) * 10), + expected(intersect) + ) + + val except = df1.except(df2) + checkAnswer( + except.filter('i < rand(7) * 10), + expected(except) + ) + } } From 2204cdb28483b249616068085d4e88554fe6acef Mon Sep 17 00:00:00 2001 From: Yin Huai Date: Tue, 22 Sep 2015 13:29:39 -0700 Subject: [PATCH 389/802] [SPARK-10672] [SQL] Do not fail when we cannot save the metadata of a data source table in a hive compatible way https://issues.apache.org/jira/browse/SPARK-10672 With changes in this PR, we will fallback to same the metadata of a table in Spark SQL specific way if we fail to save it in a hive compatible way (Hive throws an exception because of its internal restrictions, e.g. binary and decimal types cannot be saved to parquet if the metastore is running Hive 0.13). I manually tested the fix with the following test in `DataSourceWithHiveMetastoreCatalogSuite` (`spark.sql.hive.metastore.version=0.13` and `spark.sql.hive.metastore.jars`=`maven`). ``` test(s"fail to save metadata of a parquet table in hive 0.13") { withTempPath { dir => withTable("t") { val path = dir.getCanonicalPath sql( s"""CREATE TABLE t USING $provider |OPTIONS (path '$path') |AS SELECT 1 AS d1, cast("val_1" as binary) AS d2 """.stripMargin) sql( s"""describe formatted t """.stripMargin).collect.foreach(println) sqlContext.table("t").show } } } } ``` Without this fix, we will fail with the following error. 
``` org.apache.hadoop.hive.ql.metadata.HiveException: java.lang.UnsupportedOperationException: Unknown field type: binary at org.apache.hadoop.hive.ql.metadata.Hive.createTable(Hive.java:619) at org.apache.hadoop.hive.ql.metadata.Hive.createTable(Hive.java:576) at org.apache.spark.sql.hive.client.ClientWrapper$$anonfun$createTable$1.apply$mcV$sp(ClientWrapper.scala:359) at org.apache.spark.sql.hive.client.ClientWrapper$$anonfun$createTable$1.apply(ClientWrapper.scala:357) at org.apache.spark.sql.hive.client.ClientWrapper$$anonfun$createTable$1.apply(ClientWrapper.scala:357) at org.apache.spark.sql.hive.client.ClientWrapper$$anonfun$withHiveState$1.apply(ClientWrapper.scala:256) at org.apache.spark.sql.hive.client.ClientWrapper.retryLocked(ClientWrapper.scala:211) at org.apache.spark.sql.hive.client.ClientWrapper.withHiveState(ClientWrapper.scala:248) at org.apache.spark.sql.hive.client.ClientWrapper.createTable(ClientWrapper.scala:357) at org.apache.spark.sql.hive.HiveMetastoreCatalog.createDataSourceTable(HiveMetastoreCatalog.scala:358) at org.apache.spark.sql.hive.execution.CreateMetastoreDataSourceAsSelect.run(commands.scala:285) at org.apache.spark.sql.execution.ExecutedCommand.sideEffectResult$lzycompute(commands.scala:57) at org.apache.spark.sql.execution.ExecutedCommand.sideEffectResult(commands.scala:57) at org.apache.spark.sql.execution.ExecutedCommand.doExecute(commands.scala:69) at org.apache.spark.sql.execution.SparkPlan$$anonfun$execute$5.apply(SparkPlan.scala:140) at org.apache.spark.sql.execution.SparkPlan$$anonfun$execute$5.apply(SparkPlan.scala:138) at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:150) at org.apache.spark.sql.execution.SparkPlan.execute(SparkPlan.scala:138) at org.apache.spark.sql.execution.QueryExecution.toRdd$lzycompute(QueryExecution.scala:58) at org.apache.spark.sql.execution.QueryExecution.toRdd(QueryExecution.scala:58) at org.apache.spark.sql.DataFrame.(DataFrame.scala:144) at org.apache.spark.sql.DataFrame.(DataFrame.scala:129) at org.apache.spark.sql.DataFrame$.apply(DataFrame.scala:51) at org.apache.spark.sql.SQLContext.sql(SQLContext.scala:725) at org.apache.spark.sql.test.SQLTestUtils$$anonfun$sql$1.apply(SQLTestUtils.scala:56) at org.apache.spark.sql.test.SQLTestUtils$$anonfun$sql$1.apply(SQLTestUtils.scala:56) at org.apache.spark.sql.hive.DataSourceWithHiveMetastoreCatalogSuite$$anonfun$4$$anonfun$apply$1$$anonfun$apply$mcV$sp$2$$anonfun$apply$2.apply$mcV$sp(HiveMetastoreCatalogSuite.scala:165) at org.apache.spark.sql.test.SQLTestUtils$class.withTable(SQLTestUtils.scala:150) at org.apache.spark.sql.hive.DataSourceWithHiveMetastoreCatalogSuite.withTable(HiveMetastoreCatalogSuite.scala:52) at org.apache.spark.sql.hive.DataSourceWithHiveMetastoreCatalogSuite$$anonfun$4$$anonfun$apply$1$$anonfun$apply$mcV$sp$2.apply(HiveMetastoreCatalogSuite.scala:162) at org.apache.spark.sql.hive.DataSourceWithHiveMetastoreCatalogSuite$$anonfun$4$$anonfun$apply$1$$anonfun$apply$mcV$sp$2.apply(HiveMetastoreCatalogSuite.scala:161) at org.apache.spark.sql.test.SQLTestUtils$class.withTempPath(SQLTestUtils.scala:125) at org.apache.spark.sql.hive.DataSourceWithHiveMetastoreCatalogSuite.withTempPath(HiveMetastoreCatalogSuite.scala:52) at org.apache.spark.sql.hive.DataSourceWithHiveMetastoreCatalogSuite$$anonfun$4$$anonfun$apply$1.apply$mcV$sp(HiveMetastoreCatalogSuite.scala:161) at org.apache.spark.sql.hive.DataSourceWithHiveMetastoreCatalogSuite$$anonfun$4$$anonfun$apply$1.apply(HiveMetastoreCatalogSuite.scala:161) at 
org.apache.spark.sql.hive.DataSourceWithHiveMetastoreCatalogSuite$$anonfun$4$$anonfun$apply$1.apply(HiveMetastoreCatalogSuite.scala:161) at org.scalatest.Transformer$$anonfun$apply$1.apply$mcV$sp(Transformer.scala:22) at org.scalatest.OutcomeOf$class.outcomeOf(OutcomeOf.scala:85) at org.scalatest.OutcomeOf$.outcomeOf(OutcomeOf.scala:104) at org.scalatest.Transformer.apply(Transformer.scala:22) at org.scalatest.Transformer.apply(Transformer.scala:20) at org.scalatest.FunSuiteLike$$anon$1.apply(FunSuiteLike.scala:166) at org.apache.spark.SparkFunSuite.withFixture(SparkFunSuite.scala:42) at org.scalatest.FunSuiteLike$class.invokeWithFixture$1(FunSuiteLike.scala:163) at org.scalatest.FunSuiteLike$$anonfun$runTest$1.apply(FunSuiteLike.scala:175) at org.scalatest.FunSuiteLike$$anonfun$runTest$1.apply(FunSuiteLike.scala:175) at org.scalatest.SuperEngine.runTestImpl(Engine.scala:306) at org.scalatest.FunSuiteLike$class.runTest(FunSuiteLike.scala:175) at org.scalatest.FunSuite.runTest(FunSuite.scala:1555) at org.scalatest.FunSuiteLike$$anonfun$runTests$1.apply(FunSuiteLike.scala:208) at org.scalatest.FunSuiteLike$$anonfun$runTests$1.apply(FunSuiteLike.scala:208) at org.scalatest.SuperEngine$$anonfun$traverseSubNodes$1$1.apply(Engine.scala:413) at org.scalatest.SuperEngine$$anonfun$traverseSubNodes$1$1.apply(Engine.scala:401) at scala.collection.immutable.List.foreach(List.scala:318) at org.scalatest.SuperEngine.traverseSubNodes$1(Engine.scala:401) at org.scalatest.SuperEngine.org$scalatest$SuperEngine$$runTestsInBranch(Engine.scala:396) at org.scalatest.SuperEngine.runTestsImpl(Engine.scala:483) at org.scalatest.FunSuiteLike$class.runTests(FunSuiteLike.scala:208) at org.scalatest.FunSuite.runTests(FunSuite.scala:1555) at org.scalatest.Suite$class.run(Suite.scala:1424) at org.scalatest.FunSuite.org$scalatest$FunSuiteLike$$super$run(FunSuite.scala:1555) at org.scalatest.FunSuiteLike$$anonfun$run$1.apply(FunSuiteLike.scala:212) at org.scalatest.FunSuiteLike$$anonfun$run$1.apply(FunSuiteLike.scala:212) at org.scalatest.SuperEngine.runImpl(Engine.scala:545) at org.scalatest.FunSuiteLike$class.run(FunSuiteLike.scala:212) at org.apache.spark.sql.hive.DataSourceWithHiveMetastoreCatalogSuite.org$scalatest$BeforeAndAfterAll$$super$run(HiveMetastoreCatalogSuite.scala:52) at org.scalatest.BeforeAndAfterAll$class.liftedTree1$1(BeforeAndAfterAll.scala:257) at org.scalatest.BeforeAndAfterAll$class.run(BeforeAndAfterAll.scala:256) at org.apache.spark.sql.hive.DataSourceWithHiveMetastoreCatalogSuite.run(HiveMetastoreCatalogSuite.scala:52) at org.scalatest.tools.Framework.org$scalatest$tools$Framework$$runSuite(Framework.scala:462) at org.scalatest.tools.Framework$ScalaTestTask.execute(Framework.scala:671) at sbt.ForkMain$Run$2.call(ForkMain.java:294) at sbt.ForkMain$Run$2.call(ForkMain.java:284) at java.util.concurrent.FutureTask.run(FutureTask.java:262) at java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1145) at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:615) at java.lang.Thread.run(Thread.java:745) Caused by: java.lang.UnsupportedOperationException: Unknown field type: binary at org.apache.hadoop.hive.ql.io.parquet.serde.ArrayWritableObjectInspector.getObjectInspector(ArrayWritableObjectInspector.java:108) at org.apache.hadoop.hive.ql.io.parquet.serde.ArrayWritableObjectInspector.(ArrayWritableObjectInspector.java:60) at org.apache.hadoop.hive.ql.io.parquet.serde.ParquetHiveSerDe.initialize(ParquetHiveSerDe.java:113) at 
org.apache.hadoop.hive.metastore.MetaStoreUtils.getDeserializer(MetaStoreUtils.java:339) at org.apache.hadoop.hive.ql.metadata.Table.getDeserializerFromMetaStore(Table.java:288) at org.apache.hadoop.hive.ql.metadata.Table.checkValidity(Table.java:194) at org.apache.hadoop.hive.ql.metadata.Hive.createTable(Hive.java:597) ... 76 more ``` Author: Yin Huai Closes #8824 from yhuai/datasourceMetadata. --- .../spark/sql/hive/HiveMetastoreCatalog.scala | 101 +++++++++--------- 1 file changed, 50 insertions(+), 51 deletions(-) diff --git a/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveMetastoreCatalog.scala b/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveMetastoreCatalog.scala index 0c1b41e3377e3..012634cb5aeb5 100644 --- a/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveMetastoreCatalog.scala +++ b/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveMetastoreCatalog.scala @@ -309,69 +309,68 @@ private[hive] class HiveMetastoreCatalog(val client: ClientInterface, hive: Hive } // TODO: Support persisting partitioned data source relations in Hive compatible format - val hiveTable = (maybeSerDe, dataSource.relation) match { + val qualifiedTableName = tableIdent.quotedString + val (hiveCompitiableTable, logMessage) = (maybeSerDe, dataSource.relation) match { case (Some(serde), relation: HadoopFsRelation) - if relation.paths.length == 1 && relation.partitionColumns.isEmpty => - // Hive ParquetSerDe doesn't support decimal type until 1.2.0. - val isParquetSerDe = serde.inputFormat.exists(_.toLowerCase.contains("parquet")) - val hasDecimalFields = relation.schema.existsRecursively(_.isInstanceOf[DecimalType]) - - val hiveParquetSupportsDecimal = client.version match { - case org.apache.spark.sql.hive.client.hive.v1_2 => true - case _ => false - } - - if (isParquetSerDe && !hiveParquetSupportsDecimal && hasDecimalFields) { - // If Hive version is below 1.2.0, we cannot save Hive compatible schema to - // metastore when the file format is Parquet and the schema has DecimalType. - logWarning { - "Persisting Parquet relation with decimal field(s) into Hive metastore in Spark SQL " + - "specific format, which is NOT compatible with Hive. Because ParquetHiveSerDe in " + - s"Hive ${client.version.fullVersion} doesn't support decimal type. See HIVE-6384." - } - newSparkSQLSpecificMetastoreTable() - } else { - logInfo { - "Persisting data source relation with a single input path into Hive metastore in " + - s"Hive compatible format. Input path: ${relation.paths.head}" - } - newHiveCompatibleMetastoreTable(relation, serde) - } + if relation.paths.length == 1 && relation.partitionColumns.isEmpty => + val hiveTable = newHiveCompatibleMetastoreTable(relation, serde) + val message = + s"Persisting data source relation $qualifiedTableName with a single input path " + + s"into Hive metastore in Hive compatible format. Input path: ${relation.paths.head}." + (Some(hiveTable), message) case (Some(serde), relation: HadoopFsRelation) if relation.partitionColumns.nonEmpty => - logWarning { - "Persisting partitioned data source relation into Hive metastore in " + - s"Spark SQL specific format, which is NOT compatible with Hive. Input path(s): " + - relation.paths.mkString("\n", "\n", "") - } - newSparkSQLSpecificMetastoreTable() + val message = + s"Persisting partitioned data source relation $qualifiedTableName into " + + "Hive metastore in Spark SQL specific format, which is NOT compatible with Hive. 
" + + "Input path(s): " + relation.paths.mkString("\n", "\n", "") + (None, message) case (Some(serde), relation: HadoopFsRelation) => - logWarning { - "Persisting data source relation with multiple input paths into Hive metastore in " + - s"Spark SQL specific format, which is NOT compatible with Hive. Input paths: " + - relation.paths.mkString("\n", "\n", "") - } - newSparkSQLSpecificMetastoreTable() + val message = + s"Persisting data source relation $qualifiedTableName with multiple input paths into " + + "Hive metastore in Spark SQL specific format, which is NOT compatible with Hive. " + + s"Input paths: " + relation.paths.mkString("\n", "\n", "") + (None, message) case (Some(serde), _) => - logWarning { - s"Data source relation is not a ${classOf[HadoopFsRelation].getSimpleName}. " + - "Persisting it into Hive metastore in Spark SQL specific format, " + - "which is NOT compatible with Hive." - } - newSparkSQLSpecificMetastoreTable() + val message = + s"Data source relation $qualifiedTableName is not a " + + s"${classOf[HadoopFsRelation].getSimpleName}. Persisting it into Hive metastore " + + "in Spark SQL specific format, which is NOT compatible with Hive." + (None, message) case _ => - logWarning { + val message = s"Couldn't find corresponding Hive SerDe for data source provider $provider. " + - "Persisting data source relation into Hive metastore in Spark SQL specific format, " + - "which is NOT compatible with Hive." - } - newSparkSQLSpecificMetastoreTable() + s"Persisting data source relation $qualifiedTableName into Hive metastore in " + + s"Spark SQL specific format, which is NOT compatible with Hive." + (None, message) } - client.createTable(hiveTable) + (hiveCompitiableTable, logMessage) match { + case (Some(table), message) => + // We first try to save the metadata of the table in a Hive compatiable way. + // If Hive throws an error, we fall back to save its metadata in the Spark SQL + // specific way. + try { + logInfo(message) + client.createTable(table) + } catch { + case throwable: Throwable => + val warningMessage = + s"Could not persist $qualifiedTableName in a Hive compatible way. Persisting " + + s"it into Hive metastore in Spark SQL specific format." + logWarning(warningMessage, throwable) + val sparkSqlSpecificTable = newSparkSQLSpecificMetastoreTable() + client.createTable(sparkSqlSpecificTable) + } + + case (None, message) => + logWarning(message) + val hiveTable = newSparkSQLSpecificMetastoreTable() + client.createTable(hiveTable) + } } def hiveDefaultTableFilePath(tableName: String): String = { From 5aea987c904b281d7952ad8db40a32561b4ec5cf Mon Sep 17 00:00:00 2001 From: Yin Huai Date: Tue, 22 Sep 2015 13:31:35 -0700 Subject: [PATCH 390/802] [SPARK-10737] [SQL] When using UnsafeRows, SortMergeJoin may return wrong results https://issues.apache.org/jira/browse/SPARK-10737 Author: Yin Huai Closes #8854 from yhuai/SMJBug. 
--- .../codegen/GenerateProjection.scala | 2 ++ .../apache/spark/sql/execution/Window.scala | 9 ++++-- .../sql/execution/joins/SortMergeJoin.scala | 25 +++++++++++++++-- .../org/apache/spark/sql/SQLQuerySuite.scala | 28 +++++++++++++++++++ 4 files changed, 59 insertions(+), 5 deletions(-) diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/codegen/GenerateProjection.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/codegen/GenerateProjection.scala index 2164ddf03d1b2..75524b568d685 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/codegen/GenerateProjection.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/codegen/GenerateProjection.scala @@ -171,6 +171,8 @@ object GenerateProjection extends CodeGenerator[Seq[Expression], Projection] { @Override public Object apply(Object r) { + // GenerateProjection does not work with UnsafeRows. + assert(!(r instanceof ${classOf[UnsafeRow].getName})); return new SpecificRow((InternalRow) r); } diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/Window.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/Window.scala index 0269d6d4b7a1c..f8929530c5036 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/Window.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/Window.scala @@ -253,7 +253,11 @@ case class Window( // Get all relevant projections. val result = createResultProjection(unboundExpressions) - val grouping = newProjection(partitionSpec, child.output) + val grouping = if (child.outputsUnsafeRows) { + UnsafeProjection.create(partitionSpec, child.output) + } else { + newProjection(partitionSpec, child.output) + } // Manage the stream and the grouping. var nextRow: InternalRow = EmptyRow @@ -277,7 +281,8 @@ case class Window( val numFrames = frames.length private[this] def fetchNextPartition() { // Collect all the rows in the current partition. - val currentGroup = nextGroup + // Before we start to fetch new input rows, make a copy of nextGroup. + val currentGroup = nextGroup.copy() rows = new CompactBuffer while (nextRowAvailable && nextGroup == currentGroup) { rows += nextRow.copy() diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/joins/SortMergeJoin.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/joins/SortMergeJoin.scala index 906f20d2a7289..70a1af6a7063a 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/joins/SortMergeJoin.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/joins/SortMergeJoin.scala @@ -56,9 +56,6 @@ case class SortMergeJoin( override def requiredChildOrdering: Seq[Seq[SortOrder]] = requiredOrders(leftKeys) :: requiredOrders(rightKeys) :: Nil - @transient protected lazy val leftKeyGenerator = newProjection(leftKeys, left.output) - @transient protected lazy val rightKeyGenerator = newProjection(rightKeys, right.output) - protected[this] def isUnsafeMode: Boolean = { (codegenEnabled && unsafeEnabled && UnsafeProjection.canSupport(leftKeys) @@ -82,6 +79,28 @@ case class SortMergeJoin( left.execute().zipPartitions(right.execute()) { (leftIter, rightIter) => new RowIterator { + // The projection used to extract keys from input rows of the left child. + private[this] val leftKeyGenerator = { + if (isUnsafeMode) { + // It is very important to use UnsafeProjection if input rows are UnsafeRows. + // Otherwise, GenerateProjection will cause wrong results. 
+ UnsafeProjection.create(leftKeys, left.output) + } else { + newProjection(leftKeys, left.output) + } + } + + // The projection used to extract keys from input rows of the right child. + private[this] val rightKeyGenerator = { + if (isUnsafeMode) { + // It is very important to use UnsafeProjection if input rows are UnsafeRows. + // Otherwise, GenerateProjection will cause wrong results. + UnsafeProjection.create(rightKeys, right.output) + } else { + newProjection(rightKeys, right.output) + } + } + // An ordering that can be used to compare keys from both sides. private[this] val keyOrdering = newNaturalAscendingOrdering(leftKeys.map(_.dataType)) private[this] var currentLeftRow: InternalRow = _ diff --git a/sql/core/src/test/scala/org/apache/spark/sql/SQLQuerySuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/SQLQuerySuite.scala index 05b4127cbcaff..eca6f1073889a 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/SQLQuerySuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/SQLQuerySuite.scala @@ -1781,4 +1781,32 @@ class SQLQuerySuite extends QueryTest with SharedSQLContext { Seq(Row(1), Row(1))) } } + + test("SortMergeJoin returns wrong results when using UnsafeRows") { + // This test is for the fix of https://issues.apache.org/jira/browse/SPARK-10737. + // This bug will be triggered when Tungsten is enabled and there are multiple + // SortMergeJoin operators executed in the same task. + val confs = + SQLConf.SORTMERGE_JOIN.key -> "true" :: + SQLConf.AUTO_BROADCASTJOIN_THRESHOLD.key -> "1" :: + SQLConf.TUNGSTEN_ENABLED.key -> "true" :: Nil + withSQLConf(confs: _*) { + val df1 = (1 to 50).map(i => (s"str_$i", i)).toDF("i", "j") + val df2 = + df1 + .join(df1.select(df1("i")), "i") + .select(df1("i"), df1("j")) + + val df3 = df2.withColumnRenamed("i", "i1").withColumnRenamed("j", "j1") + val df4 = + df2 + .join(df3, df2("i") === df3("i1")) + .withColumn("diff", $"j" - $"j1") + .select(df2("i"), df2("j"), $"diff") + + checkAnswer( + df4, + df1.withColumn("diff", lit(0))) + } + } } From a96ba40f7ee1352288ea676d8844e1c8174202eb Mon Sep 17 00:00:00 2001 From: Reynold Xin Date: Tue, 22 Sep 2015 14:11:46 -0700 Subject: [PATCH 391/802] [SPARK-10714] [SPARK-8632] [SPARK-10685] [SQL] Refactor Python UDF handling This patch refactors Python UDF handling: 1. Extract the per-partition Python UDF calling logic from PythonRDD into a PythonRunner. PythonRunner itself expects iterator as input/output, and thus has no dependency on RDD. This way, we can use PythonRunner directly in a mapPartitions call, or in the future in an environment without RDDs. 2. Use PythonRunner in Spark SQL's BatchPythonEvaluation. 3. Updated BatchPythonEvaluation to only use its input once, rather than twice. This should fix Python UDF performance regression in Spark 1.5. There are a number of small cleanups I wanted to do when I looked at the code, but I kept most of those out so the diff looks small. This basically implements the approach in https://github.com/apache/spark/pull/8833, but with some code moving around so the correctness doesn't depend on the inner workings of Spark serialization and task execution. Author: Reynold Xin Closes #8835 from rxin/python-iter-refactor. 
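The single-pass input handling in the new `BatchPythonEvaluation` can be summarised with a small, dependency-free sketch (plain Scala, no Spark or Py4J; `runExternally` is a stand-in for the round trip through the Python worker and is assumed to return results in input order):

```scala
object QueueJoinExample {
  // Buffers every row in a queue at the moment it is sent out, and pairs each
  // result coming back with the row drained from the head of the queue.
  def evaluate[Row, Out](
      input: Iterator[Row],
      extractUdfInput: Row => Int,
      runExternally: Iterator[Int] => Iterator[Out]): Iterator[(Row, Out)] = {
    val queue = new java.util.ArrayDeque[Row]()
    // Side effect: remember each row exactly when it is handed to the worker.
    val toWorker = input.map { row => queue.addLast(row); extractUdfInput(row) }
    // Results come back in order, so the head of the queue is always the row
    // that produced the current result.
    runExternally(toWorker).map(out => (queue.pollFirst(), out))
  }

  def main(args: Array[String]): Unit = {
    val rows = Iterator(("a", 1), ("b", 2), ("c", 3))
    // The stand-in "worker" just multiplies the extracted column by 10.
    evaluate[(String, Int), Int](rows, _._2, _.map(_ * 10))
      .foreach(println)  // ((a,1),10), ((b,2),20), ((c,3),30)
  }
}
```

Because the join relies only on ordering, the input iterator is consumed exactly once; the trade-off, called out in the code, is that a very slow Python process lets the queue grow without bound.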
--- .../apache/spark/api/python/PythonRDD.scala | 54 ++++++++++--- .../spark/sql/execution/pythonUDFs.scala | 80 +++++++++++-------- 2 files changed, 89 insertions(+), 45 deletions(-) diff --git a/core/src/main/scala/org/apache/spark/api/python/PythonRDD.scala b/core/src/main/scala/org/apache/spark/api/python/PythonRDD.scala index 69da180593bb5..3788d1829758a 100644 --- a/core/src/main/scala/org/apache/spark/api/python/PythonRDD.scala +++ b/core/src/main/scala/org/apache/spark/api/python/PythonRDD.scala @@ -24,6 +24,7 @@ import java.util.{Collections, ArrayList => JArrayList, List => JList, Map => JM import scala.collection.JavaConverters._ import scala.collection.mutable import scala.language.existentials +import scala.util.control.NonFatal import com.google.common.base.Charsets.UTF_8 import org.apache.hadoop.conf.Configuration @@ -38,7 +39,6 @@ import org.apache.spark.input.PortableDataStream import org.apache.spark.rdd.RDD import org.apache.spark.util.{SerializableConfiguration, Utils} -import scala.util.control.NonFatal private[spark] class PythonRDD( parent: RDD[_], @@ -61,11 +61,39 @@ private[spark] class PythonRDD( if (preservePartitoning) firstParent.partitioner else None } + val asJavaRDD: JavaRDD[Array[Byte]] = JavaRDD.fromRDD(this) + override def compute(split: Partition, context: TaskContext): Iterator[Array[Byte]] = { + val runner = new PythonRunner( + command, envVars, pythonIncludes, pythonExec, pythonVer, broadcastVars, accumulator, + bufferSize, reuse_worker) + runner.compute(firstParent.iterator(split, context), split.index, context) + } +} + + +/** + * A helper class to run Python UDFs in Spark. + */ +private[spark] class PythonRunner( + command: Array[Byte], + envVars: JMap[String, String], + pythonIncludes: JList[String], + pythonExec: String, + pythonVer: String, + broadcastVars: JList[Broadcast[PythonBroadcast]], + accumulator: Accumulator[JList[Array[Byte]]], + bufferSize: Int, + reuse_worker: Boolean) + extends Logging { + + def compute( + inputIterator: Iterator[_], + partitionIndex: Int, + context: TaskContext): Iterator[Array[Byte]] = { val startTime = System.currentTimeMillis val env = SparkEnv.get - val localdir = env.blockManager.diskBlockManager.localDirs.map( - f => f.getPath()).mkString(",") + val localdir = env.blockManager.diskBlockManager.localDirs.map(f => f.getPath()).mkString(",") envVars.put("SPARK_LOCAL_DIRS", localdir) // it's also used in monitor thread if (reuse_worker) { envVars.put("SPARK_REUSE_WORKER", "1") @@ -75,7 +103,7 @@ private[spark] class PythonRDD( @volatile var released = false // Start a thread to feed the process input from our parent's iterator - val writerThread = new WriterThread(env, worker, split, context) + val writerThread = new WriterThread(env, worker, inputIterator, partitionIndex, context) context.addTaskCompletionListener { context => writerThread.shutdownOnTaskCompletion() @@ -183,13 +211,16 @@ private[spark] class PythonRDD( new InterruptibleIterator(context, stdoutIterator) } - val asJavaRDD : JavaRDD[Array[Byte]] = JavaRDD.fromRDD(this) - /** * The thread responsible for writing the data from the PythonRDD's parent iterator to the * Python process. 
*/ - class WriterThread(env: SparkEnv, worker: Socket, split: Partition, context: TaskContext) + class WriterThread( + env: SparkEnv, + worker: Socket, + inputIterator: Iterator[_], + partitionIndex: Int, + context: TaskContext) extends Thread(s"stdout writer for $pythonExec") { @volatile private var _exception: Exception = null @@ -211,11 +242,11 @@ private[spark] class PythonRDD( val stream = new BufferedOutputStream(worker.getOutputStream, bufferSize) val dataOut = new DataOutputStream(stream) // Partition index - dataOut.writeInt(split.index) + dataOut.writeInt(partitionIndex) // Python version of driver PythonRDD.writeUTF(pythonVer, dataOut) // sparkFilesDir - PythonRDD.writeUTF(SparkFiles.getRootDirectory, dataOut) + PythonRDD.writeUTF(SparkFiles.getRootDirectory(), dataOut) // Python includes (*.zip and *.egg files) dataOut.writeInt(pythonIncludes.size()) for (include <- pythonIncludes.asScala) { @@ -246,7 +277,7 @@ private[spark] class PythonRDD( dataOut.writeInt(command.length) dataOut.write(command) // Data values - PythonRDD.writeIteratorToStream(firstParent.iterator(split, context), dataOut) + PythonRDD.writeIteratorToStream(inputIterator, dataOut) dataOut.writeInt(SpecialLengths.END_OF_DATA_SECTION) dataOut.writeInt(SpecialLengths.END_OF_STREAM) dataOut.flush() @@ -327,7 +358,8 @@ private[spark] object PythonRDD extends Logging { // remember the broadcasts sent to each worker private val workerBroadcasts = new mutable.WeakHashMap[Socket, mutable.Set[Long]]() - private def getWorkerBroadcasts(worker: Socket) = { + + def getWorkerBroadcasts(worker: Socket): mutable.Set[Long] = { synchronized { workerBroadcasts.getOrElseUpdate(worker, new mutable.HashSet[Long]()) } diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/pythonUDFs.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/pythonUDFs.scala index d0411da6fdf5a..c35c726bfc503 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/pythonUDFs.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/pythonUDFs.scala @@ -25,7 +25,7 @@ import scala.collection.JavaConverters._ import net.razorvine.pickle._ import org.apache.spark.annotation.DeveloperApi -import org.apache.spark.api.python.{PythonBroadcast, PythonRDD, SerDeUtil} +import org.apache.spark.api.python.{PythonRunner, PythonBroadcast, PythonRDD, SerDeUtil} import org.apache.spark.broadcast.Broadcast import org.apache.spark.rdd.RDD import org.apache.spark.sql.catalyst.InternalRow @@ -35,7 +35,7 @@ import org.apache.spark.sql.catalyst.plans.logical.LogicalPlan import org.apache.spark.sql.catalyst.rules.Rule import org.apache.spark.sql.types._ import org.apache.spark.unsafe.types.UTF8String -import org.apache.spark.{Accumulator, Logging => SparkLogging} +import org.apache.spark.{Logging => SparkLogging, TaskContext, Accumulator} /** * A serialized version of a Python lambda function. Suitable for use in a [[PythonRDD]]. @@ -329,7 +329,13 @@ case class EvaluatePython( /** * :: DeveloperApi :: * Uses PythonRDD to evaluate a [[PythonUDF]], one partition of tuples at a time. - * The input data is zipped with the result of the udf evaluation. + * + * Python evaluation works by sending the necessary (projected) input data via a socket to an + * external Python process, and combine the result from the Python process with the original row. + * + * For each row we send to Python, we also put it in a queue. For each output row from Python, + * we drain the queue to find the original input row. 
Note that if the Python process is way too + * slow, this could lead to the queue growing unbounded and eventually run out of memory. */ @DeveloperApi case class BatchPythonEvaluation(udf: PythonUDF, output: Seq[Attribute], child: SparkPlan) @@ -342,51 +348,57 @@ case class BatchPythonEvaluation(udf: PythonUDF, output: Seq[Attribute], child: override def canProcessSafeRows: Boolean = true protected override def doExecute(): RDD[InternalRow] = { - val childResults = child.execute().map(_.copy()) + val inputRDD = child.execute().map(_.copy()) + val bufferSize = inputRDD.conf.getInt("spark.buffer.size", 65536) + val reuseWorker = inputRDD.conf.getBoolean("spark.python.worker.reuse", defaultValue = true) - val parent = childResults.mapPartitions { iter => + inputRDD.mapPartitions { iter => EvaluatePython.registerPicklers() // register pickler for Row + + // The queue used to buffer input rows so we can drain it to + // combine input with output from Python. + val queue = new java.util.concurrent.ConcurrentLinkedQueue[InternalRow]() + val pickle = new Pickler val currentRow = newMutableProjection(udf.children, child.output)() val fields = udf.children.map(_.dataType) val schema = new StructType(fields.map(t => new StructField("", t, true)).toArray) - iter.grouped(100).map { inputRows => + + // Input iterator to Python: input rows are grouped so we send them in batches to Python. + // For each row, add it to the queue. + val inputIterator = iter.grouped(100).map { inputRows => val toBePickled = inputRows.map { row => + queue.add(row) EvaluatePython.toJava(currentRow(row), schema) }.toArray pickle.dumps(toBePickled) } - } - val pyRDD = new PythonRDD( - parent, - udf.command, - udf.envVars, - udf.pythonIncludes, - false, - udf.pythonExec, - udf.pythonVer, - udf.broadcastVars, - udf.accumulator - ).mapPartitions { iter => - val pickle = new Unpickler - iter.flatMap { pickedResult => - val unpickledBatch = pickle.loads(pickedResult) - unpickledBatch.asInstanceOf[java.util.ArrayList[Any]].asScala - } - }.mapPartitions { iter => + val context = TaskContext.get() + + // Output iterator for results from Python. + val outputIterator = new PythonRunner( + udf.command, + udf.envVars, + udf.pythonIncludes, + udf.pythonExec, + udf.pythonVer, + udf.broadcastVars, + udf.accumulator, + bufferSize, + reuseWorker + ).compute(inputIterator, context.partitionId(), context) + + val unpickle = new Unpickler val row = new GenericMutableRow(1) - iter.map { result => - row(0) = EvaluatePython.fromJava(result, udf.dataType) - row: InternalRow - } - } + val joined = new JoinedRow - childResults.zip(pyRDD).mapPartitions { iter => - val joinedRow = new JoinedRow() - iter.map { - case (row, udfResult) => - joinedRow(row, udfResult) + outputIterator.flatMap { pickedResult => + val unpickledBatch = unpickle.loads(pickedResult) + unpickledBatch.asInstanceOf[java.util.ArrayList[Any]].asScala + }.map { result => + row(0) = EvaluatePython.fromJava(result, udf.dataType) + joined(queue.poll(), row) } } } From 61d4c07f4becb42f054e588be56ed13239644410 Mon Sep 17 00:00:00 2001 From: Andrew Or Date: Tue, 22 Sep 2015 16:35:43 -0700 Subject: [PATCH 392/802] [SPARK-10640] History server fails to parse TaskCommitDenied ... simply because the code is missing! Author: Andrew Or Closes #8828 from andrewor14/task-end-reason-json. 
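For event logs written before this change the new fields are simply absent, so the parser falls back to placeholder values instead of failing the whole parse. A dependency-free sketch of that rule (the `Map` stands in for the parsed JSON object; all names here are illustrative only):

```scala
object TaskCommitDeniedCompatExample {
  final case class TaskCommitDeniedInfo(jobId: Int, partitionId: Int, attemptNumber: Int)

  // Each field independently falls back to -1 when it is missing from the log.
  def fromParsedJson(fields: Map[String, Int]): TaskCommitDeniedInfo =
    TaskCommitDeniedInfo(
      jobId = fields.getOrElse("Job ID", -1),
      partitionId = fields.getOrElse("Partition ID", -1),
      attemptNumber = fields.getOrElse("Attempt Number", -1))

  def main(args: Array[String]): Unit = {
    // A log written after this change carries all three fields ...
    println(fromParsedJson(Map("Job ID" -> 2, "Partition ID" -> 3, "Attempt Number" -> 4)))
    // ... while an older log yields the placeholders instead of an exception.
    println(fromParsedJson(Map.empty))
  }
}
```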
--- .../scala/org/apache/spark/TaskEndReason.scala | 6 +++++- .../org/apache/spark/util/JsonProtocol.scala | 13 +++++++++++++ .../apache/spark/util/JsonProtocolSuite.scala | 17 +++++++++++++++++ 3 files changed, 35 insertions(+), 1 deletion(-) diff --git a/core/src/main/scala/org/apache/spark/TaskEndReason.scala b/core/src/main/scala/org/apache/spark/TaskEndReason.scala index 7137246bc34f2..9335c5f4160bf 100644 --- a/core/src/main/scala/org/apache/spark/TaskEndReason.scala +++ b/core/src/main/scala/org/apache/spark/TaskEndReason.scala @@ -17,13 +17,17 @@ package org.apache.spark -import java.io.{IOException, ObjectInputStream, ObjectOutputStream} +import java.io.{ObjectInputStream, ObjectOutputStream} import org.apache.spark.annotation.DeveloperApi import org.apache.spark.executor.TaskMetrics import org.apache.spark.storage.BlockManagerId import org.apache.spark.util.Utils +// ============================================================================================== +// NOTE: new task end reasons MUST be accompanied with serialization logic in util.JsonProtocol! +// ============================================================================================== + /** * :: DeveloperApi :: * Various possible reasons why a task ended. The low-level TaskScheduler is supposed to retry diff --git a/core/src/main/scala/org/apache/spark/util/JsonProtocol.scala b/core/src/main/scala/org/apache/spark/util/JsonProtocol.scala index 99614a786bd93..40729fa5a4ffe 100644 --- a/core/src/main/scala/org/apache/spark/util/JsonProtocol.scala +++ b/core/src/main/scala/org/apache/spark/util/JsonProtocol.scala @@ -362,6 +362,10 @@ private[spark] object JsonProtocol { ("Stack Trace" -> stackTrace) ~ ("Full Stack Trace" -> exceptionFailure.fullStackTrace) ~ ("Metrics" -> metrics) + case taskCommitDenied: TaskCommitDenied => + ("Job ID" -> taskCommitDenied.jobID) ~ + ("Partition ID" -> taskCommitDenied.partitionID) ~ + ("Attempt Number" -> taskCommitDenied.attemptNumber) case ExecutorLostFailure(executorId, isNormalExit) => ("Executor ID" -> executorId) ~ ("Normal Exit" -> isNormalExit) @@ -770,6 +774,7 @@ private[spark] object JsonProtocol { val exceptionFailure = Utils.getFormattedClassName(ExceptionFailure) val taskResultLost = Utils.getFormattedClassName(TaskResultLost) val taskKilled = Utils.getFormattedClassName(TaskKilled) + val taskCommitDenied = Utils.getFormattedClassName(TaskCommitDenied) val executorLostFailure = Utils.getFormattedClassName(ExecutorLostFailure) val unknownReason = Utils.getFormattedClassName(UnknownReason) @@ -794,6 +799,14 @@ private[spark] object JsonProtocol { ExceptionFailure(className, description, stackTrace, fullStackTrace, metrics, None) case `taskResultLost` => TaskResultLost case `taskKilled` => TaskKilled + case `taskCommitDenied` => + // Unfortunately, the `TaskCommitDenied` message was introduced in 1.3.0 but the JSON + // de/serialization logic was not added until 1.5.1. To provide backward compatibility + // for reading those logs, we need to provide default values for all the fields. + val jobId = Utils.jsonOption(json \ "Job ID").map(_.extract[Int]).getOrElse(-1) + val partitionId = Utils.jsonOption(json \ "Partition ID").map(_.extract[Int]).getOrElse(-1) + val attemptNo = Utils.jsonOption(json \ "Attempt Number").map(_.extract[Int]).getOrElse(-1) + TaskCommitDenied(jobId, partitionId, attemptNo) case `executorLostFailure` => val isNormalExit = Utils.jsonOption(json \ "Normal Exit"). 
map(_.extract[Boolean]) diff --git a/core/src/test/scala/org/apache/spark/util/JsonProtocolSuite.scala b/core/src/test/scala/org/apache/spark/util/JsonProtocolSuite.scala index 143c1b901df11..a24bf2931cca0 100644 --- a/core/src/test/scala/org/apache/spark/util/JsonProtocolSuite.scala +++ b/core/src/test/scala/org/apache/spark/util/JsonProtocolSuite.scala @@ -151,6 +151,7 @@ class JsonProtocolSuite extends SparkFunSuite { testTaskEndReason(exceptionFailure) testTaskEndReason(TaskResultLost) testTaskEndReason(TaskKilled) + testTaskEndReason(TaskCommitDenied(2, 3, 4)) testTaskEndReason(ExecutorLostFailure("100", true)) testTaskEndReason(UnknownReason) @@ -352,6 +353,17 @@ class JsonProtocolSuite extends SparkFunSuite { assertEquals(expectedStageInfo, JsonProtocol.stageInfoFromJson(oldStageInfo)) } + // `TaskCommitDenied` was added in 1.3.0 but JSON de/serialization logic was added in 1.5.1 + test("TaskCommitDenied backward compatibility") { + val denied = TaskCommitDenied(1, 2, 3) + val oldDenied = JsonProtocol.taskEndReasonToJson(denied) + .removeField({ _._1 == "Job ID" }) + .removeField({ _._1 == "Partition ID" }) + .removeField({ _._1 == "Attempt Number" }) + val expectedDenied = TaskCommitDenied(-1, -1, -1) + assertEquals(expectedDenied, JsonProtocol.taskEndReasonFromJson(oldDenied)) + } + /** -------------------------- * | Helper test running methods | * --------------------------- */ @@ -577,6 +589,11 @@ class JsonProtocolSuite extends SparkFunSuite { assertOptionEquals(r1.metrics, r2.metrics, assertTaskMetricsEquals) case (TaskResultLost, TaskResultLost) => case (TaskKilled, TaskKilled) => + case (TaskCommitDenied(jobId1, partitionId1, attemptNumber1), + TaskCommitDenied(jobId2, partitionId2, attemptNumber2)) => + assert(jobId1 === jobId2) + assert(partitionId1 === partitionId2) + assert(attemptNumber1 === attemptNumber2) case (ExecutorLostFailure(execId1, isNormalExit1), ExecutorLostFailure(execId2, isNormalExit2)) => assert(execId1 === execId2) From 84f81e035e1dab1b42c36563041df6ba16e7b287 Mon Sep 17 00:00:00 2001 From: Zhichao Li Date: Tue, 22 Sep 2015 19:41:57 -0700 Subject: [PATCH 393/802] [SPARK-10310] [SQL] Fixes script transformation field/line delimiters **Please attribute this PR to `Zhichao Li `.** This PR is based on PR #8476 authored by zhichao-li. It fixes SPARK-10310 by adding field delimiter SerDe property to the default `LazySimpleSerDe`, and enabling default record reader/writer classes. Currently, we only support `LazySimpleSerDe`, used together with `TextRecordReader` and `TextRecordWriter`, and don't support customizing record reader/writer using `RECORDREADER`/`RECORDWRITER` clauses. This should be addressed in separate PR(s). Author: Cheng Lian Closes #8860 from liancheng/spark-10310/fix-script-trans-delimiters. 
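To make the effect concrete, the sketch below shows the kind of query that now works with no explicit row format clause. It assumes a Hive-enabled `sqlContext` and the usual `src(key, value)` test table, so it is an illustration rather than a test from this patch: the script's input columns are separated by the `'\t'` field delimiter added to the default `LazySimpleSerDe`, and its tab-separated stdout is split back into the declared output columns by the default record reader/writer.

```scala
// Rough illustration only; assumes a HiveContext-backed sqlContext and a
// `src(key INT, value STRING)` table, as in the Hive test suites.
val transformed = sqlContext.sql(
  """
    |SELECT TRANSFORM (key, value)
    |USING 'cat'
    |AS (k, v)
    |FROM src
  """.stripMargin)

// With the default tab field delimiter on both sides of the script, 'cat'
// echoes "key<TAB>value" lines and each line is split back into (k, v).
transformed.show()
```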
--- .../org/apache/spark/sql/hive/HiveQl.scala | 52 ++++++++++--- .../hive/execution/ScriptTransformation.scala | 75 +++++++++++++++---- .../resources/data/scripts/test_transform.py | 6 ++ .../sql/hive/execution/SQLQuerySuite.scala | 39 ++++++++++ .../execution/ScriptTransformationSuite.scala | 2 + 5 files changed, 152 insertions(+), 22 deletions(-) create mode 100755 sql/hive/src/test/resources/data/scripts/test_transform.py diff --git a/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveQl.scala b/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveQl.scala index d5cd7e98b5267..256440a9a2e97 100644 --- a/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveQl.scala +++ b/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveQl.scala @@ -32,6 +32,7 @@ import org.apache.hadoop.hive.ql.lib.Node import org.apache.hadoop.hive.ql.parse._ import org.apache.hadoop.hive.ql.plan.PlanUtils import org.apache.hadoop.hive.ql.session.SessionState +import org.apache.hadoop.hive.serde2.`lazy`.LazySimpleSerDe import org.apache.spark.Logging import org.apache.spark.sql.AnalysisException @@ -884,16 +885,22 @@ https://cwiki.apache.org/confluence/display/Hive/Enhanced+Aggregation%2C+Cube%2C AttributeReference("value", StringType)()), true) } - def matchSerDe(clause: Seq[ASTNode]) - : (Seq[(String, String)], Option[String], Seq[(String, String)]) = clause match { + type SerDeInfo = ( + Seq[(String, String)], // Input row format information + Option[String], // Optional input SerDe class + Seq[(String, String)], // Input SerDe properties + Boolean // Whether to use default record reader/writer + ) + + def matchSerDe(clause: Seq[ASTNode]): SerDeInfo = clause match { case Token("TOK_SERDEPROPS", propsClause) :: Nil => val rowFormat = propsClause.map { case Token(name, Token(value, Nil) :: Nil) => (name, value) } - (rowFormat, None, Nil) + (rowFormat, None, Nil, false) case Token("TOK_SERDENAME", Token(serdeClass, Nil) :: Nil) :: Nil => - (Nil, Some(BaseSemanticAnalyzer.unescapeSQLString(serdeClass)), Nil) + (Nil, Some(BaseSemanticAnalyzer.unescapeSQLString(serdeClass)), Nil, false) case Token("TOK_SERDENAME", Token(serdeClass, Nil) :: Token("TOK_TABLEPROPERTIES", @@ -903,20 +910,47 @@ https://cwiki.apache.org/confluence/display/Hive/Enhanced+Aggregation%2C+Cube%2C (BaseSemanticAnalyzer.unescapeSQLString(name), BaseSemanticAnalyzer.unescapeSQLString(value)) } - (Nil, Some(BaseSemanticAnalyzer.unescapeSQLString(serdeClass)), serdeProps) - case Nil => (Nil, Option(hiveConf.getVar(ConfVars.HIVESCRIPTSERDE)), Nil) + // SPARK-10310: Special cases LazySimpleSerDe + // TODO Fully supports user-defined record reader/writer classes + val unescapedSerDeClass = BaseSemanticAnalyzer.unescapeSQLString(serdeClass) + val useDefaultRecordReaderWriter = + unescapedSerDeClass == classOf[LazySimpleSerDe].getCanonicalName + (Nil, Some(unescapedSerDeClass), serdeProps, useDefaultRecordReaderWriter) + + case Nil => + // Uses default TextRecordReader/TextRecordWriter, sets field delimiter here + val serdeProps = Seq(serdeConstants.FIELD_DELIM -> "\t") + (Nil, Option(hiveConf.getVar(ConfVars.HIVESCRIPTSERDE)), serdeProps, true) } - val (inRowFormat, inSerdeClass, inSerdeProps) = matchSerDe(inputSerdeClause) - val (outRowFormat, outSerdeClass, outSerdeProps) = matchSerDe(outputSerdeClause) + val (inRowFormat, inSerdeClass, inSerdeProps, useDefaultRecordReader) = + matchSerDe(inputSerdeClause) + + val (outRowFormat, outSerdeClass, outSerdeProps, useDefaultRecordWriter) = + matchSerDe(outputSerdeClause) val unescapedScript = 
BaseSemanticAnalyzer.unescapeSQLString(script) + // TODO Adds support for user-defined record reader/writer classes + val recordReaderClass = if (useDefaultRecordReader) { + Option(hiveConf.getVar(ConfVars.HIVESCRIPTRECORDREADER)) + } else { + None + } + + val recordWriterClass = if (useDefaultRecordWriter) { + Option(hiveConf.getVar(ConfVars.HIVESCRIPTRECORDWRITER)) + } else { + None + } + val schema = HiveScriptIOSchema( inRowFormat, outRowFormat, inSerdeClass, outSerdeClass, - inSerdeProps, outSerdeProps, schemaLess) + inSerdeProps, outSerdeProps, + recordReaderClass, recordWriterClass, + schemaLess) Some( logical.ScriptTransformation( diff --git a/sql/hive/src/main/scala/org/apache/spark/sql/hive/execution/ScriptTransformation.scala b/sql/hive/src/main/scala/org/apache/spark/sql/hive/execution/ScriptTransformation.scala index 32bddbaeaeaf9..b30117f0de997 100644 --- a/sql/hive/src/main/scala/org/apache/spark/sql/hive/execution/ScriptTransformation.scala +++ b/sql/hive/src/main/scala/org/apache/spark/sql/hive/execution/ScriptTransformation.scala @@ -24,20 +24,22 @@ import javax.annotation.Nullable import scala.collection.JavaConverters._ import scala.util.control.NonFatal +import org.apache.hadoop.conf.Configuration +import org.apache.hadoop.hive.ql.exec.{RecordReader, RecordWriter} import org.apache.hadoop.hive.serde.serdeConstants import org.apache.hadoop.hive.serde2.AbstractSerDe import org.apache.hadoop.hive.serde2.objectinspector._ import org.apache.hadoop.io.Writable import org.apache.spark.rdd.RDD -import org.apache.spark.sql.catalyst.{CatalystTypeConverters, InternalRow} import org.apache.spark.sql.catalyst.expressions._ import org.apache.spark.sql.catalyst.plans.logical.ScriptInputOutputSchema +import org.apache.spark.sql.catalyst.{CatalystTypeConverters, InternalRow} import org.apache.spark.sql.execution._ import org.apache.spark.sql.hive.HiveShim._ import org.apache.spark.sql.hive.{HiveContext, HiveInspectors} import org.apache.spark.sql.types.DataType -import org.apache.spark.util.{CircularBuffer, RedirectThread, Utils} +import org.apache.spark.util.{CircularBuffer, RedirectThread, SerializableConfiguration, Utils} import org.apache.spark.{Logging, TaskContext} /** @@ -58,6 +60,8 @@ case class ScriptTransformation( override def otherCopyArgs: Seq[HiveContext] = sc :: Nil + private val serializedHiveConf = new SerializableConfiguration(sc.hiveconf) + protected override def doExecute(): RDD[InternalRow] = { def processIterator(inputIterator: Iterator[InternalRow]): Iterator[InternalRow] = { val cmd = List("/bin/bash", "-c", script) @@ -67,6 +71,7 @@ case class ScriptTransformation( val inputStream = proc.getInputStream val outputStream = proc.getOutputStream val errorStream = proc.getErrorStream + val localHiveConf = serializedHiveConf.value // In order to avoid deadlocks, we need to consume the error output of the child process. 
// To avoid issues caused by large error output, we use a circular buffer to limit the amount @@ -96,7 +101,8 @@ case class ScriptTransformation( outputStream, proc, stderrBuffer, - TaskContext.get() + TaskContext.get(), + localHiveConf ) // This nullability is a performance optimization in order to avoid an Option.foreach() call @@ -109,6 +115,10 @@ case class ScriptTransformation( val outputIterator: Iterator[InternalRow] = new Iterator[InternalRow] with HiveInspectors { var curLine: String = null val scriptOutputStream = new DataInputStream(inputStream) + + @Nullable val scriptOutputReader = + ioschema.recordReader(scriptOutputStream, localHiveConf).orNull + var scriptOutputWritable: Writable = null val reusedWritableObject: Writable = if (null != outputSerde) { outputSerde.getSerializedClass().newInstance @@ -134,15 +144,25 @@ case class ScriptTransformation( } } else if (scriptOutputWritable == null) { scriptOutputWritable = reusedWritableObject - try { - scriptOutputWritable.readFields(scriptOutputStream) - true - } catch { - case _: EOFException => - if (writerThread.exception.isDefined) { - throw writerThread.exception.get - } + + if (scriptOutputReader != null) { + if (scriptOutputReader.next(scriptOutputWritable) <= 0) { + writerThread.exception.foreach(throw _) false + } else { + true + } + } else { + try { + scriptOutputWritable.readFields(scriptOutputStream) + true + } catch { + case _: EOFException => + if (writerThread.exception.isDefined) { + throw writerThread.exception.get + } + false + } } } else { true @@ -210,7 +230,8 @@ private class ScriptTransformationWriterThread( outputStream: OutputStream, proc: Process, stderrBuffer: CircularBuffer, - taskContext: TaskContext + taskContext: TaskContext, + conf: Configuration ) extends Thread("Thread-ScriptTransformation-Feed") with Logging { setDaemon(true) @@ -224,6 +245,7 @@ private class ScriptTransformationWriterThread( TaskContext.setTaskContext(taskContext) val dataOutputStream = new DataOutputStream(outputStream) + @Nullable val scriptInputWriter = ioschema.recordWriter(dataOutputStream, conf).orNull // We can't use Utils.tryWithSafeFinally here because we also need a `catch` block, so // let's use a variable to record whether the `finally` block was hit due to an exception @@ -250,7 +272,12 @@ private class ScriptTransformationWriterThread( } else { val writable = inputSerde.serialize( row.asInstanceOf[GenericInternalRow].values, inputSoi) - prepareWritable(writable, ioschema.outputSerdeProps).write(dataOutputStream) + + if (scriptInputWriter != null) { + scriptInputWriter.write(writable) + } else { + prepareWritable(writable, ioschema.outputSerdeProps).write(dataOutputStream) + } } } outputStream.close() @@ -290,6 +317,8 @@ case class HiveScriptIOSchema ( outputSerdeClass: Option[String], inputSerdeProps: Seq[(String, String)], outputSerdeProps: Seq[(String, String)], + recordReaderClass: Option[String], + recordWriterClass: Option[String], schemaLess: Boolean) extends ScriptInputOutputSchema with HiveInspectors { private val defaultFormat = Map( @@ -347,4 +376,24 @@ case class HiveScriptIOSchema ( serde } + + def recordReader( + inputStream: InputStream, + conf: Configuration): Option[RecordReader] = { + recordReaderClass.map { klass => + val instance = Utils.classForName(klass).newInstance().asInstanceOf[RecordReader] + val props = new Properties() + props.putAll(outputSerdeProps.toMap.asJava) + instance.initialize(inputStream, conf, props) + instance + } + } + + def recordWriter(outputStream: OutputStream, conf: 
Configuration): Option[RecordWriter] = { + recordWriterClass.map { klass => + val instance = Utils.classForName(klass).newInstance().asInstanceOf[RecordWriter] + instance.initialize(outputStream, conf) + instance + } + } } diff --git a/sql/hive/src/test/resources/data/scripts/test_transform.py b/sql/hive/src/test/resources/data/scripts/test_transform.py new file mode 100755 index 0000000000000..ac6d11d8b919c --- /dev/null +++ b/sql/hive/src/test/resources/data/scripts/test_transform.py @@ -0,0 +1,6 @@ +import sys + +delim = sys.argv[1] + +for row in sys.stdin: + print(delim.join([w + '#' for w in row[:-1].split(delim)])) diff --git a/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/SQLQuerySuite.scala b/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/SQLQuerySuite.scala index bb02473dd17ca..71823e32ad389 100644 --- a/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/SQLQuerySuite.scala +++ b/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/SQLQuerySuite.scala @@ -1184,4 +1184,43 @@ class SQLQuerySuite extends QueryTest with SQLTestUtils with TestHiveSingleton { checkAnswer(df, Row("text inside layer 2") :: Nil) } + + test("SPARK-10310: " + + "script transformation using default input/output SerDe and record reader/writer") { + sqlContext + .range(5) + .selectExpr("id AS a", "id AS b") + .registerTempTable("test") + + checkAnswer( + sql( + """FROM( + | FROM test SELECT TRANSFORM(a, b) + | USING 'python src/test/resources/data/scripts/test_transform.py "\t"' + | AS (c STRING, d STRING) + |) t + |SELECT c + """.stripMargin), + (0 until 5).map(i => Row(i + "#"))) + } + + test("SPARK-10310: script transformation using LazySimpleSerDe") { + sqlContext + .range(5) + .selectExpr("id AS a", "id AS b") + .registerTempTable("test") + + val df = sql( + """FROM test + |SELECT TRANSFORM(a, b) + |ROW FORMAT SERDE 'org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe' + |WITH SERDEPROPERTIES('field.delim' = '|') + |USING 'python src/test/resources/data/scripts/test_transform.py "|"' + |AS (c STRING, d STRING) + |ROW FORMAT SERDE 'org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe' + |WITH SERDEPROPERTIES('field.delim' = '|') + """.stripMargin) + + checkAnswer(df, (0 until 5).map(i => Row(i + "#", i + "#"))) + } } diff --git a/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/ScriptTransformationSuite.scala b/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/ScriptTransformationSuite.scala index cb8d0fca8e693..7cfdb886b585d 100644 --- a/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/ScriptTransformationSuite.scala +++ b/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/ScriptTransformationSuite.scala @@ -38,6 +38,8 @@ class ScriptTransformationSuite extends SparkPlanTest with TestHiveSingleton { outputSerdeClass = None, inputSerdeProps = Seq.empty, outputSerdeProps = Seq.empty, + recordReaderClass = None, + recordWriterClass = None, schemaLess = false ) From 558e9c7e60a7c0d85ba26634e97562ad2163e91d Mon Sep 17 00:00:00 2001 From: Matt Hagen Date: Tue, 22 Sep 2015 21:14:25 -0700 Subject: [PATCH 394/802] [SPARK-10663] Removed unnecessary invocation of DataFrame.toDF method. The Scala example under the "Example: Pipeline" heading in this document initializes the "test" variable to a DataFrame. Because test is already a DF, there is not need to call test.toDF as the example does in a subsequent line: model.transform(test.toDF). So, I removed the extraneous toDF invocation. 
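For reference, a minimal sketch of the corrected snippet from `docs/ml-guide.md` (it assumes the surrounding guide example, i.e. a fitted `model: PipelineModel` and the `test` DataFrame built just above via `sqlContext.createDataFrame(...).toDF("id", "text")`; the `println` body is paraphrased from the guide rather than quoted from this patch):

```scala
import org.apache.spark.ml.PipelineModel
import org.apache.spark.mllib.linalg.Vector
import org.apache.spark.sql.Row

// `test` is already a DataFrame, so it can be passed to transform() directly;
// the removed `test.toDF` call was a redundant no-op conversion.
model.transform(test)
  .select("id", "text", "probability", "prediction")
  .collect()
  .foreach { case Row(id: Long, text: String, prob: Vector, prediction: Double) =>
    println(s"($id, $text) --> prob=$prob, prediction=$prediction")
  }
```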
Author: Matt Hagen Closes #8875 from hagenhaus/SPARK-10663. --- docs/ml-guide.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/ml-guide.md b/docs/ml-guide.md index 0427ac6695aa1..fd3a6167bc65e 100644 --- a/docs/ml-guide.md +++ b/docs/ml-guide.md @@ -475,7 +475,7 @@ val test = sqlContext.createDataFrame(Seq( )).toDF("id", "text") // Make predictions on test documents. -model.transform(test.toDF) +model.transform(test) .select("id", "text", "probability", "prediction") .collect() .foreach { case Row(id: Long, text: String, prob: Vector, prediction: Double) => From 5548a254755bb84edae2768b94ab1816e1b49b91 Mon Sep 17 00:00:00 2001 From: Tathagata Das Date: Tue, 22 Sep 2015 22:44:09 -0700 Subject: [PATCH 395/802] [SPARK-10652] [SPARK-10742] [STREAMING] Set meaningful job descriptions for all streaming jobs Here is the screenshot after adding the job descriptions to threads that run receivers and the scheduler thread running the batch jobs. ## All jobs page * Added job descriptions with links to relevant batch details page ![image](https://cloud.githubusercontent.com/assets/663212/9924165/cda4a372-5cb1-11e5-91ca-d43a32c699e9.png) ## All stages page * Added stage descriptions with links to relevant batch details page ![image](https://cloud.githubusercontent.com/assets/663212/9923814/2cce266a-5cae-11e5-8a3f-dad84d06c50e.png) ## Streaming batch details page * Added the +details link ![image](https://cloud.githubusercontent.com/assets/663212/9921977/24014a32-5c98-11e5-958e-457b6c38065b.png) Author: Tathagata Das Closes #8791 from tdas/SPARK-10652. --- .../scala/org/apache/spark/ui/UIUtils.scala | 62 ++++++++++++++++- .../apache/spark/ui/jobs/AllJobsPage.scala | 14 ++-- .../org/apache/spark/ui/jobs/StageTable.scala | 7 +- .../org/apache/spark/ui/UIUtilsSuite.scala | 66 +++++++++++++++++++ .../spark/streaming/StreamingContext.scala | 4 +- .../streaming/scheduler/JobScheduler.scala | 15 ++++- .../streaming/scheduler/ReceiverTracker.scala | 5 +- .../apache/spark/streaming/ui/BatchPage.scala | 33 ++++++---- .../streaming/StreamingContextSuite.scala | 2 +- 9 files changed, 179 insertions(+), 29 deletions(-) create mode 100644 core/src/test/scala/org/apache/spark/ui/UIUtilsSuite.scala diff --git a/core/src/main/scala/org/apache/spark/ui/UIUtils.scala b/core/src/main/scala/org/apache/spark/ui/UIUtils.scala index f2da417724104..21dc8f0b65485 100644 --- a/core/src/main/scala/org/apache/spark/ui/UIUtils.scala +++ b/core/src/main/scala/org/apache/spark/ui/UIUtils.scala @@ -18,9 +18,11 @@ package org.apache.spark.ui import java.text.SimpleDateFormat -import java.util.{Locale, Date} +import java.util.{Date, Locale} -import scala.xml.{Node, Text, Unparsed} +import scala.util.control.NonFatal +import scala.xml._ +import scala.xml.transform.{RewriteRule, RuleTransformer} import org.apache.spark.Logging import org.apache.spark.ui.scope.RDDOperationGraph @@ -395,4 +397,60 @@ private[spark] object UIUtils extends Logging { } + /** + * Returns HTML rendering of a job or stage description. It will try to parse the string as HTML + * and make sure that it only contains anchors with root-relative links. Otherwise, + * the whole string will rendered as a simple escaped text. + * + * Note: In terms of security, only anchor tags with root relative links are supported. So any + * attempts to embed links outside Spark UI, or other tags like